diff --git "a/log_node0.txt" "b/log_node0.txt" new file mode 100644--- /dev/null +++ "b/log_node0.txt" @@ -0,0 +1,20471 @@ ++ echo Logging output to /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//log_node0.txt +Logging output to /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//log_node0.txt ++ export ASCEND_PROCESS_LOG_PATH=/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//ascend/0 ++ ASCEND_PROCESS_LOG_PATH=/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//ascend/0 ++ mkdir -p /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//ascend/0 ++ DATA_PATH=/local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml ++ TOKENIZER_PATH=/data_4/models/Qwen/Qwen2.5-14B-Instruct/ ++ CKPT_LOAD_DIR=/data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/ ++ VIT_CKPT_LOAD_DIR=/ ++ CKPT_SAVE_DIR=/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213// ++ rsync -avh /local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/ +sending incremental file list + +sent 71 bytes received 12 bytes 166.00 bytes/sec +total size is 23.84K speedup is 287.17 ++ cd /local_disk/cognitron_vl/ ++ rm -fr datasets ++ mkdir -p datasets ++ ln -s /data/data/ datasets/CV ++ ln -s /data/data/LLM datasets/LLM ++ ln -s /data/data/LMM datasets/LMM ++ source /local_disk/cognitron_vl//scripts/set_env_mg_npu.sh +++ source /usr/local/Ascend/driver/bin/setenv.bash ++++ DEP_INFO_FILE=/etc/ascend_install.info ++++ [[ -f /etc/ascend_install.info ]] ++++ . 
/etc/ascend_install.info ++++ DRV_LIB64_COMMON_LDPATH=/driver/lib64/common ++++ DRV_LIB64_DRV_LDPATH=/driver/lib64/driver ++++ DRV_LIB64_LDPATH=/driver/lib64 ++++ export LD_LIBRARY_PATH=/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: ++++ LD_LIBRARY_PATH=/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: ++++ export PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin ++++ PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin +++ source /usr/local/Ascend/ascend-toolkit/set_env.sh ++++ export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: ++++ 
LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: ++++ export ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest ++++ ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest +++++ arch ++++ export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: ++++ LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: ++++ export 
LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: ++++ LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: ++++ export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe: ++++ PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe: ++++ export PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin ++++ 
PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin ++++ export ASCEND_AICPU_PATH=/usr/local/Ascend/ascend-toolkit/latest ++++ ASCEND_AICPU_PATH=/usr/local/Ascend/ascend-toolkit/latest ++++ export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp ++++ ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp ++++ export TOOLCHAIN_HOME=/usr/local/Ascend/ascend-toolkit/latest/toolkit ++++ TOOLCHAIN_HOME=/usr/local/Ascend/ascend-toolkit/latest/toolkit ++++ export ASCEND_HOME_PATH=/usr/local/Ascend/ascend-toolkit/latest ++++ ASCEND_HOME_PATH=/usr/local/Ascend/ascend-toolkit/latest +++ export HCCL_CONNECT_TIMEOUT=7200 +++ HCCL_CONNECT_TIMEOUT=7200 +++ export HCCL_EXEC_TIMEOUT=7200 +++ HCCL_EXEC_TIMEOUT=7200 +++ export COMBINED_ENABLE=1 +++ COMBINED_ENABLE=1 +++ export MULTI_STREAM_MEMORY_REUSE=1 +++ MULTI_STREAM_MEMORY_REUSE=1 +++ export HCCL_RDMA_TC=160 +++ HCCL_RDMA_TC=160 +++ export HCCL_RDMA_SL=5 +++ HCCL_RDMA_SL=5 +++ export HCCL_INTRA_PCIE_ENABLE=0 +++ HCCL_INTRA_PCIE_ENABLE=0 +++ export HCCL_INTRA_ROCE_ENABLE=1 +++ HCCL_INTRA_ROCE_ENABLE=1 +++ export HCCL_RDMA_TIMEOUT=20 +++ HCCL_RDMA_TIMEOUT=20 +++ export INF_NAN_MODE_ENABLE=1 +++ INF_NAN_MODE_ENABLE=1 +++ export DISTRIBUTED_BACKEND=hccl +++ DISTRIBUTED_BACKEND=hccl +++ export ASCEND_LAUNCH_BLOCKING=0 +++ ASCEND_LAUNCH_BLOCKING=0 +++ export ASCEND_SLOG_PRINT_TO_STDOUT=0 +++ ASCEND_SLOG_PRINT_TO_STDOUT=0 +++ export ASCEND_GLOBAL_LOG_LEVEL=3 +++ ASCEND_GLOBAL_LOG_LEVEL=3 +++ export ASCEND_GLOBAL_EVENT_ENABLE=0 +++ ASCEND_GLOBAL_EVENT_ENABLE=0 +++ export TASK_QUEUE_ENABLE=1 +++ TASK_QUEUE_ENABLE=1 +++ export PTCOPY_ENABLE=1 +++ PTCOPY_ENABLE=1 +++ export COMBINED_ENABLE=1 +++ COMBINED_ENABLE=1 +++ export DYNAMIC_OP=ADD#MUL +++ DYNAMIC_OP=ADD#MUL +++ export HCCL_WHITELIST_DISABLE=1 +++ HCCL_WHITELIST_DISABLE=1 +++ export HCCL_CONNECT_TIMEOUT=7200 +++ HCCL_CONNECT_TIMEOUT=7200 +++ export HCCL_WHITELIST_DISABLE=1 +++ HCCL_WHITELIST_DISABLE=1 +++ export CUDA_DEVICE_MAX_CONNECTIONS=1 +++ CUDA_DEVICE_MAX_CONNECTIONS=1 +++ pip3 install --no-index --find-links=/data/software/ -r requirements_npu.txt +Looking in links: /data/software/ +Processing data/software/expecttest-0.2.1-py3-none-any.whl (from -r requirements_npu.txt (line 1)) +Requirement already satisfied: peft in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 2)) (0.7.0) +Processing data/software/XlsxWriter-3.2.0-py3-none-any.whl (from -r requirements_npu.txt (line 3)) +Requirement already satisfied: termcolor in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 4)) (2.4.0) +Requirement already satisfied: tabulate in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 5)) (0.9.0) +Processing data/software/tiktoken-0.7.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from -r requirements_npu.txt (line 6)) +Requirement already satisfied: matplotlib in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 7)) (3.7.5) +Processing 
data/software/datasets-3.0.0-py3-none-any.whl (from -r requirements_npu.txt (line 8)) +Requirement already satisfied: einops in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 9)) (0.7.0) +Processing data/software/pybind11-2.13.6-py3-none-any.whl (from -r requirements_npu.txt (line 10)) +Requirement already satisfied: tensorboardX in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 11)) (2.6.2.2) +Processing data/software/pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from -r requirements_npu.txt (line 12)) +Requirement already satisfied: transformers>=4.40.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 13)) (4.40.1) +Requirement already satisfied: deepspeed>=0.14.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 14)) (0.14.5) +Processing data/software/accelerate-0.34.2-py3-none-any.whl (from -r requirements_npu.txt (line 15)) +Requirement already satisfied: timm in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 16)) (0.9.16) +Processing data/software/flask-3.0.3-py3-none-any.whl (from -r requirements_npu.txt (line 17)) +Processing data/software/Flask_RESTful-0.3.10-py2.py3-none-any.whl (from -r requirements_npu.txt (line 18)) +Processing data/software/decord-0.6.0-py3-none-manylinux2010_x86_64.whl (from -r requirements_npu.txt (line 19)) +Processing data/software/natsort-8.4.0-py3-none-any.whl (from -r requirements_npu.txt (line 20)) +Requirement already satisfied: numpy>=1.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (1.24.4) +Requirement already satisfied: packaging>=20.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (23.2) +Requirement already satisfied: psutil in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (5.9.8) +Requirement already satisfied: pyyaml in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (5.4.1) +Requirement already satisfied: torch>=1.13.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (2.1.0+cpu) +Requirement already satisfied: tqdm in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (4.66.2) +Requirement already satisfied: safetensors in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (0.4.2) +Requirement already satisfied: huggingface-hub>=0.17.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (0.20.3) +Requirement already satisfied: regex>=2022.1.18 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from tiktoken->-r requirements_npu.txt (line 6)) (2023.12.25) +Requirement already satisfied: requests>=2.26.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from tiktoken->-r requirements_npu.txt (line 6)) (2.31.0) +Requirement already satisfied: contourpy>=1.0.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (1.1.1) +Requirement already satisfied: cycler>=0.10 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (0.12.1) +Requirement already 
satisfied: fonttools>=4.22.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (4.49.0) +Requirement already satisfied: kiwisolver>=1.0.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (1.4.5) +Requirement already satisfied: pillow>=6.2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (10.2.0) +Requirement already satisfied: pyparsing>=2.3.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (3.1.1) +Requirement already satisfied: python-dateutil>=2.7 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (2.8.2) +Requirement already satisfied: importlib-resources>=3.2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (6.1.2) +Requirement already satisfied: filelock in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (3.13.1) +Requirement already satisfied: dill<0.3.9,>=0.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (0.3.7) +Requirement already satisfied: pandas in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (2.0.3) +Processing data/software/requests-2.32.3-py3-none-any.whl (from tiktoken->-r requirements_npu.txt (line 6)) +Processing data/software/tqdm-4.67.1-py3-none-any.whl (from peft->-r requirements_npu.txt (line 2)) +Requirement already satisfied: xxhash in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (3.4.1) +Requirement already satisfied: multiprocess in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (0.70.15) +Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (2023.10.0) +Requirement already satisfied: aiohttp in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (3.9.3) +Processing data/software/huggingface_hub-0.26.2-py3-none-any.whl (from peft->-r requirements_npu.txt (line 2)) +Requirement already satisfied: protobuf>=3.20 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from tensorboardX->-r requirements_npu.txt (line 11)) (4.25.3) +Requirement already satisfied: tokenizers<0.20,>=0.19 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers>=4.40.1->-r requirements_npu.txt (line 13)) (0.19.1) +Requirement already satisfied: hjson in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (3.1.0) +Requirement already satisfied: ninja in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (1.11.1.1) +Requirement already satisfied: nvidia-ml-py in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (12.560.30) +Requirement already satisfied: py-cpuinfo in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt 
(line 14)) (9.0.0) +Requirement already satisfied: pydantic in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (1.10.15) +Processing data/software/safetensors-0.4.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from peft->-r requirements_npu.txt (line 2)) +Requirement already satisfied: torchvision in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from timm->-r requirements_npu.txt (line 16)) (0.16.0) +Requirement already satisfied: Werkzeug>=3.0.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (3.0.1) +Requirement already satisfied: Jinja2>=3.1.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (3.1.3) +Processing data/software/itsdangerous-2.2.0-py3-none-any.whl (from flask->-r requirements_npu.txt (line 17)) +Requirement already satisfied: click>=8.1.3 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (8.1.7) +Processing data/software/blinker-1.8.2-py3-none-any.whl (from flask->-r requirements_npu.txt (line 17)) +Requirement already satisfied: importlib-metadata>=3.6.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (7.0.1) +Processing data/software/aniso8601-9.0.1-py2.py3-none-any.whl (from flask_restful->-r requirements_npu.txt (line 18)) +Requirement already satisfied: six>=1.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask_restful->-r requirements_npu.txt (line 18)) (1.16.0) +Requirement already satisfied: pytz in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask_restful->-r requirements_npu.txt (line 18)) (2024.1) +Requirement already satisfied: aiosignal>=1.1.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (1.3.1) +Requirement already satisfied: attrs>=17.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (23.2.0) +Requirement already satisfied: frozenlist>=1.1.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (1.4.1) +Requirement already satisfied: multidict<7.0,>=4.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (6.0.5) +Requirement already satisfied: yarl<2.0,>=1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (1.9.4) +Requirement already satisfied: async-timeout<5.0,>=4.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (4.0.3) +Requirement already satisfied: typing-extensions>=3.7.4.3 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from huggingface-hub>=0.17.0->peft->-r requirements_npu.txt (line 2)) (4.10.0) +Requirement already satisfied: zipp>=0.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from importlib-metadata>=3.6.0->flask->-r requirements_npu.txt (line 17)) (3.17.0) +Requirement already satisfied: MarkupSafe>=2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from Jinja2>=3.1.2->flask->-r requirements_npu.txt (line 17)) (2.1.5) +Requirement already satisfied: charset-normalizer<4,>=2 in 
/root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (3.3.2) +Requirement already satisfied: idna<4,>=2.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (3.6) +Requirement already satisfied: urllib3<3,>=1.21.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (1.26.18) +Requirement already satisfied: certifi>=2017.4.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (2024.2.2) +Requirement already satisfied: sympy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft->-r requirements_npu.txt (line 2)) (1.4) +Requirement already satisfied: networkx in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft->-r requirements_npu.txt (line 2)) (3.1) +Requirement already satisfied: tzdata>=2022.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (2024.1) +Requirement already satisfied: mpmath>=0.19 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from sympy->torch>=1.13.0->peft->-r requirements_npu.txt (line 2)) (1.3.0) +DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063 +Installing collected packages: aniso8601, xlsxwriter, tqdm, safetensors, requests, pybind11, pyarrow, natsort, itsdangerous, expecttest, decord, blinker, tiktoken, huggingface-hub, flask, flask_restful, accelerate, datasets + Attempting uninstall: tqdm + Found existing installation: tqdm 4.66.2 + Uninstalling tqdm-4.66.2: + Successfully uninstalled tqdm-4.66.2 + Attempting uninstall: safetensors + Found existing installation: safetensors 0.4.2 + Uninstalling safetensors-0.4.2: + Successfully uninstalled safetensors-0.4.2 + Attempting uninstall: requests + Found existing installation: requests 2.31.0 + Uninstalling requests-2.31.0: + Successfully uninstalled requests-2.31.0 + Attempting uninstall: pyarrow + Found existing installation: pyarrow 15.0.0 + Uninstalling pyarrow-15.0.0: + Successfully uninstalled pyarrow-15.0.0 + Attempting uninstall: huggingface-hub + Found existing installation: huggingface-hub 0.20.3 + Uninstalling huggingface-hub-0.20.3: + Successfully uninstalled huggingface-hub-0.20.3 + Attempting uninstall: accelerate + Found existing installation: accelerate 0.25.0 + Uninstalling accelerate-0.25.0: + Successfully uninstalled accelerate-0.25.0 + Attempting uninstall: datasets + Found existing installation: datasets 2.16.0 + Uninstalling datasets-2.16.0: + Successfully uninstalled datasets-2.16.0 +ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. +tikit 1.8.2.240926 requires dicttoxml==1.7.4, which is not installed. +tikit 1.8.2.240926 requires docopt==0.6.2, which is not installed. +tikit 1.8.2.240926 requires future==0.18.2, which is not installed. +tikit 1.8.2.240926 requires hdfs==2.6.0, which is not installed. 
+tikit 1.8.2.240926 requires pure-sasl==0.6.2, which is not installed. +tikit 1.8.2.240926 requires py4j==0.10.7, which is not installed. +tikit 1.8.2.240926 requires PyHive[hive]==0.6.4, which is not installed. +tikit 1.8.2.240926 requires pyjwt>=2.4.0, which is not installed. +tikit 1.8.2.240926 requires requests-kerberos>=0.14.0, which is not installed. +tikit 1.8.2.240926 requires sasl==0.3.1, which is not installed. +tikit 1.8.2.240926 requires thrift==0.15.0, which is not installed. +tikit 1.8.2.240926 requires thrift-sasl>=0.1.0, which is not installed. +tikit 1.8.2.240926 requires certifi==2021.10.8, but you have certifi 2024.2.2 which is incompatible. +tikit 1.8.2.240926 requires cos-python-sdk-v5==1.9.29, but you have cos-python-sdk-v5 1.9.26 which is incompatible. +tikit 1.8.2.240926 requires idna==3.3, but you have idna 3.6 which is incompatible. +tikit 1.8.2.240926 requires prettytable==2.5.0, but you have prettytable 3.11.0 which is incompatible. +tikit 1.8.2.240926 requires urllib3==1.26.7, but you have urllib3 1.26.18 which is incompatible. +tikit 1.8.2.240926 requires wcwidth==0.2.5, but you have wcwidth 0.2.13 which is incompatible. +Successfully installed accelerate-0.34.2 aniso8601-9.0.1 blinker-1.8.2 datasets-3.0.0 decord-0.6.0 expecttest-0.2.1 flask-3.0.3 flask_restful-0.3.10 huggingface-hub-0.26.2 itsdangerous-2.2.0 natsort-8.4.0 pyarrow-17.0.0 pybind11-2.13.6 requests-2.32.3 safetensors-0.4.5 tiktoken-0.7.0 tqdm-4.67.1 xlsxwriter-3.2.0 +WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv +++ return 0 ++ MEGATRON_DIR=/local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0/ ++ MINDSPEED_DIR=/local_disk/cognitron_vl//third_party/MindSpeed_core_r0.6.0/ ++ MODELLINK_DIR=/local_disk/cognitron_vl//third_party/ModelLink/ ++ pip3 install --no-index --find-links=/data/software/ -e /local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0/ +Looking in links: /data/software/ +Obtaining file://local_disk/cognitron_vl/third_party/Megatron-LM_core_r0.6.0 + Installing build dependencies: started + Installing build dependencies: finished with status 'done' + Checking if build backend supports build_editable: started + Checking if build backend supports build_editable: finished with status 'done' + Getting requirements to build editable: started + Getting requirements to build editable: finished with status 'done' + Installing backend dependencies: started + Installing backend dependencies: finished with status 'done' + Preparing editable metadata (pyproject.toml): started + Preparing editable metadata (pyproject.toml): finished with status 'done' +Building wheels for collected packages: megatron_core + Building editable for megatron_core (pyproject.toml): started + Building editable for megatron_core (pyproject.toml): finished with status 'done' + Created wheel for megatron_core: filename=megatron_core-0.6.0-0.editable-cp38-cp38-linux_x86_64.whl size=8791 sha256=06d5bd071b6eadb2bc6965a495bd802172dae415af74dd60b1478328d6910bcd + Stored in directory: /tmp/pip-ephem-wheel-cache-m7mtfhay/wheels/54/9c/d1/d2015aa0c34e791e64d65d19395e5a9a5528f0c63fd519b9ff +Successfully built megatron_core +DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. 
A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063 +Installing collected packages: megatron_core +Successfully installed megatron_core-0.6.0 +WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv ++ pip3 install --no-index --find-links=/data/software/ -e /local_disk/cognitron_vl//third_party/MindSpeed_core_r0.6.0/ +Looking in links: /data/software/ +Obtaining file://local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0 + Preparing metadata (setup.py): started + Preparing metadata (setup.py): finished with status 'done' +WARNING: Error parsing requirements for tokenizers: [Errno 2] No such file or directory: '/root/miniconda3/envs/py38/lib/python3.8/site-packages/tokenizers-0.19.1.dist-info/METADATA' +WARNING: Error parsing requirements for transformers: [Errno 2] No such file or directory: '/root/miniconda3/envs/py38/lib/python3.8/site-packages/transformers-4.40.1.dist-info/METADATA' +DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063 +Installing collected packages: mindspeed + Running setup.py develop for mindspeed +Successfully installed mindspeed-0.6.0 +WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv ++ pip3 install --no-index --find-links=/data/software/ -e /local_disk/cognitron_vl//third_party/ModelLink/ +Looking in links: /data/software/ +Obtaining file://local_disk/cognitron_vl/third_party/ModelLink + Preparing metadata (setup.py): started + Preparing metadata (setup.py): finished with status 'done' +Requirement already satisfied: numpy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.24.4) +Processing data/software/transformers-4.43.2-py3-none-any.whl (from modellink==0.0.1) +Processing data/software/transformers-stream-generator-0.0.5.tar.gz (from modellink==0.0.1) + Preparing metadata (setup.py): started + Preparing metadata (setup.py): finished with status 'done' +Requirement already satisfied: sympy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.4) +Requirement already satisfied: decorator in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (5.1.1) +Requirement already satisfied: scipy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.10.1) +Requirement already satisfied: sentencepiece in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.2.0) +Requirement already satisfied: einops in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.7.0) +Requirement already satisfied: datasets in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (3.0.0) +Requirement already satisfied: pybind11 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (2.13.6) +Requirement already satisfied: accelerate in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.34.2) +Requirement already satisfied: six in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.16.0) +Requirement already satisfied: protobuf in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (4.25.3) +Processing data/software/peft-0.7.1-py3-none-any.whl (from modellink==0.0.1) +Requirement already satisfied: tiktoken in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.7.0) +Requirement already satisfied: packaging>=20.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (23.2) +Requirement already satisfied: psutil in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (5.9.8) +Requirement already satisfied: pyyaml in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (5.4.1) +Requirement already satisfied: torch>=1.13.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (2.1.0+cpu) +Requirement already satisfied: tqdm in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (4.67.1) +Requirement already satisfied: safetensors in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (0.4.5) +Requirement already satisfied: huggingface-hub>=0.17.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (0.26.2) +Requirement already satisfied: filelock in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers==4.43.2->modellink==0.0.1) (3.13.1) +Requirement already satisfied: 
regex!=2019.12.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers==4.43.2->modellink==0.0.1) (2023.12.25) +Requirement already satisfied: requests in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers==4.43.2->modellink==0.0.1) (2.32.3) +Processing data/software/tokenizers-0.19.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from transformers==4.43.2->modellink==0.0.1) +Requirement already satisfied: pyarrow>=15.0.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (17.0.0) +Requirement already satisfied: dill<0.3.9,>=0.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (0.3.7) +Requirement already satisfied: pandas in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (2.0.3) +Requirement already satisfied: xxhash in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (3.4.1) +Requirement already satisfied: multiprocess in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (0.70.15) +Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets->modellink==0.0.1) (2023.10.0) +Requirement already satisfied: aiohttp in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (3.9.3) +Requirement already satisfied: mpmath>=0.19 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from sympy->modellink==0.0.1) (1.3.0) +Requirement already satisfied: aiosignal>=1.1.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (1.3.1) +Requirement already satisfied: attrs>=17.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (23.2.0) +Requirement already satisfied: frozenlist>=1.1.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (1.4.1) +Requirement already satisfied: multidict<7.0,>=4.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (6.0.5) +Requirement already satisfied: yarl<2.0,>=1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (1.9.4) +Requirement already satisfied: async-timeout<5.0,>=4.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (4.0.3) +Requirement already satisfied: typing-extensions>=3.7.4.3 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from huggingface-hub>=0.17.0->peft==0.7.1->modellink==0.0.1) (4.10.0) +Requirement already satisfied: charset-normalizer<4,>=2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (3.3.2) +Requirement already satisfied: idna<4,>=2.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (3.6) +Requirement already satisfied: urllib3<3,>=1.21.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (1.26.18) +Requirement already satisfied: certifi>=2017.4.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (2024.2.2) +Requirement already satisfied: networkx in 
/root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft==0.7.1->modellink==0.0.1) (3.1) +Requirement already satisfied: jinja2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft==0.7.1->modellink==0.0.1) (3.1.3) +Requirement already satisfied: python-dateutil>=2.8.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets->modellink==0.0.1) (2.8.2) +Requirement already satisfied: pytz>=2020.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets->modellink==0.0.1) (2024.1) +Requirement already satisfied: tzdata>=2022.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets->modellink==0.0.1) (2024.1) +Requirement already satisfied: MarkupSafe>=2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from jinja2->torch>=1.13.0->peft==0.7.1->modellink==0.0.1) (2.1.5) +Building wheels for collected packages: transformers_stream_generator + Building wheel for transformers_stream_generator (setup.py): started + Building wheel for transformers_stream_generator (setup.py): finished with status 'done' + Created wheel for transformers_stream_generator: filename=transformers_stream_generator-0.0.5-py3-none-any.whl size=12425 sha256=3ed62a866ab10917ceed94a0bafc0596380802f798ed67b7de78b76fe0b65f1f + Stored in directory: /root/.cache/pip/wheels/56/8c/42/5381d9c36bc85f28982f4cf8f98dc44d37a6d6c04897a5cb7c +Successfully built transformers_stream_generator +DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063 +Installing collected packages: tokenizers, transformers, transformers_stream_generator, peft, modellink + Attempting uninstall: tokenizers + Found existing installation: tokenizers 0.20.3 + Uninstalling tokenizers-0.20.3: + Successfully uninstalled tokenizers-0.20.3 + Attempting uninstall: transformers + Found existing installation: transformers 4.46.3 + Uninstalling transformers-4.46.3: + Successfully uninstalled transformers-4.46.3 + Attempting uninstall: peft + Found existing installation: peft 0.7.0 + Uninstalling peft-0.7.0: + Successfully uninstalled peft-0.7.0 + Running setup.py develop for modellink +Successfully installed modellink-0.0.1 peft-0.7.1 tokenizers-0.19.1 transformers-4.43.2 transformers_stream_generator-0.0.5 +WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv ++ export PYTHONPATH=/local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0//:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe: ++ PYTHONPATH=/local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0//:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe: ++ GPUS_PER_NODE=16 ++ NNODES=32 ++ NODE_RANK=0 ++ MASTER_PORT=34567 ++ export CUDA_DEVICE_MAX_CONNECTIONS=1 ++ CUDA_DEVICE_MAX_CONNECTIONS=1 ++ VISION_SEQ_LENGTH=1025 ++ IMAGE_TOKEN_LENGTH=256 ++ IMAGE_SIZE=448 ++ VISION_MODEL_TYPE=intern_300m ++ TP=8 ++ PP=1 ++ CP=2 ++ CP_ALGO=megatron_cp_algo ++ CP_MASK=causal ++ DISTRIBUTED_ARGS=' + --nproc_per_node 16 --nnodes 32 --node_rank 0 --master_addr train-1197954740059955456-93njiyzl9b0g-master-0.train-100034032793.svc.cluster.local --master_port 34567 +' ++ GPT_ARGS=' + --use-mcore-models --tensor-model-parallel-size 8 --pipeline-model-parallel-size 1 --context-parallel-size 2 --context-parallel-algo megatron_cp_algo --cp-attention-mask-type causal --use-cp-send-recv-overlap --no-create-attention-mask-in-dataloader --sparse-mode 4 --sequence-parallel --recompute-method block --recompute-granularity full --recompute-num-layers 48 --num-layers 48 --hidden-size 5120 --ffn-hidden-size 13824 --num-attention-heads 40 --group-query-attention --num-query-groups 8 --tokenizer-type PretrainedFromHF --tokenizer-name-or-path /data_4/models/Qwen/Qwen2.5-14B-Instruct/ --seq-length 131072 --max-position-embeddings 131072 --micro-batch-size 1 --global-batch-size 64 --make-vocab-size-divisible-by 1 --padded-vocab-size 152064 --rotary-base 1000000.0 --lr 5.00e-6 --train-iters 1000 --lr-decay-style cosine --untie-embeddings-and-output-weights --disable-bias-linear --attention-dropout 0.0 --init-method-std 0.01 --hidden-dropout 0.0 --position-embedding-type rope --normalization RMSNorm --use-fused-rmsnorm --norm-epsilon 1e-6 --swiglu --use-flash-attn --use-fused-rotary-pos-emb --use-rotary-position-embeddings --use-fused-swiglu --use-mc2 --no-masked-softmax-fusion --attention-softmax-in-fp32 --min-lr 1.00e-7 --weight-decay 0.0 --lr-warmup-fraction 0.03 --clip-grad 1.0 --adam-beta1 0.9 --adam-beta2 0.999 --add-qkv-bias --initial-loss-scale 4096 --no-gradient-accumulation-fusion --use-distributed-optimizer --bf16 --overlap-grad-reduce --finetune --vision-model-freeze --vision-model-type intern_300m --vision-downsample-ratio 0.5 --vision-projector-type mlp --vision-projector-pre-norm --vision-process-type dynamic --vision-normalize-type imagenet --vision-seq-length 1025 --image-token-length 256 --image-size 448 --prompt-format qwen2 --is-instruction-dataset --max-num-frame 512 --max-fps 1 --add-class-token --min-patch-grid 1 --max-patch-grid 12 --cross-dataset-joint ' ++ DATA_ARGS=' + --data-path /local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml --split 100,0,0 --data-seq-length 131072 --num-workers 8 ' ++ CKPT_ARGS=' + --load /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/ --vit-load / --no-load-optim 
--no-load-rng --seed 424242 --save /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213// ' ++ OUTPUT_ARGS=' + --log-interval 1 --save-interval 100 --eval-interval 100 --eval-iters 0 --log-throughput --distributed-timeout-minutes 120 ' ++ torchrun --nproc_per_node 16 --nnodes 32 --node_rank 0 --master_addr train-1197954740059955456-93njiyzl9b0g-master-0.train-100034032793.svc.cluster.local --master_port 34567 /local_disk/cognitron_vl//lcvlm_modellink/pretrain_lcvlm.py --use-mcore-models --tensor-model-parallel-size 8 --pipeline-model-parallel-size 1 --context-parallel-size 2 --context-parallel-algo megatron_cp_algo --cp-attention-mask-type causal --use-cp-send-recv-overlap --no-create-attention-mask-in-dataloader --sparse-mode 4 --sequence-parallel --recompute-method block --recompute-granularity full --recompute-num-layers 48 --num-layers 48 --hidden-size 5120 --ffn-hidden-size 13824 --num-attention-heads 40 --group-query-attention --num-query-groups 8 --tokenizer-type PretrainedFromHF --tokenizer-name-or-path /data_4/models/Qwen/Qwen2.5-14B-Instruct/ --seq-length 131072 --max-position-embeddings 131072 --micro-batch-size 1 --global-batch-size 64 --make-vocab-size-divisible-by 1 --padded-vocab-size 152064 --rotary-base 1000000.0 --lr 5.00e-6 --train-iters 1000 --lr-decay-style cosine --untie-embeddings-and-output-weights --disable-bias-linear --attention-dropout 0.0 --init-method-std 0.01 --hidden-dropout 0.0 --position-embedding-type rope --normalization RMSNorm --use-fused-rmsnorm --norm-epsilon 1e-6 --swiglu --use-flash-attn --use-fused-rotary-pos-emb --use-rotary-position-embeddings --use-fused-swiglu --use-mc2 --no-masked-softmax-fusion --attention-softmax-in-fp32 --min-lr 1.00e-7 --weight-decay 0.0 --lr-warmup-fraction 0.03 --clip-grad 1.0 --adam-beta1 0.9 --adam-beta2 0.999 --add-qkv-bias --initial-loss-scale 4096 --no-gradient-accumulation-fusion --use-distributed-optimizer --bf16 --overlap-grad-reduce --finetune --vision-model-freeze --vision-model-type intern_300m --vision-downsample-ratio 0.5 --vision-projector-type mlp --vision-projector-pre-norm --vision-process-type dynamic --vision-normalize-type imagenet --vision-seq-length 1025 --image-token-length 256 --image-size 448 --prompt-format qwen2 --is-instruction-dataset --max-num-frame 512 --max-fps 1 --add-class-token --min-patch-grid 1 --max-patch-grid 12 --cross-dataset-joint --data-path /local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml --split 100,0,0 --data-seq-length 131072 --num-workers 8 --load /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/ --vit-load / --no-load-optim --no-load-rng --seed 424242 --save /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213// --log-interval 1 --save-interval 100 --eval-interval 100 --eval-iters 0 --log-throughput --distributed-timeout-minutes 120 --distributed-backend nccl +[2024-11-27 12:44:15,231] torch.distributed.run: [WARNING] +[2024-11-27 12:44:15,231] torch.distributed.run: [WARNING] ***************************************** +[2024-11-27 12:44:15,231] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
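For readability, the torchrun invocation traced above is equivalent to a launch script of roughly the following shape. This is a condensed sketch assembled only from the variables and flags recorded in this log (entry point, paths, parallel sizes and batch settings are copied from the trace); it is not the original finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh, and the model/vision flag list is abbreviated to a representative subset.

#!/bin/bash
# Condensed sketch of the launch recorded above; flag lists are abbreviated.
GPUS_PER_NODE=16
NNODES=32
NODE_RANK=0
MASTER_ADDR=train-1197954740059955456-93njiyzl9b0g-master-0.train-100034032793.svc.cluster.local
MASTER_PORT=34567
TP=8; PP=1; CP=2    # tensor / pipeline / context parallel sizes

DATA_PATH=/local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml
TOKENIZER_PATH=/data_4/models/Qwen/Qwen2.5-14B-Instruct/
CKPT_LOAD_DIR=/data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/
CKPT_SAVE_DIR=/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

# Abbreviated: the full model/vision flag list is in the traced command above.
GPT_ARGS="--use-mcore-models --tensor-model-parallel-size $TP --pipeline-model-parallel-size $PP \
    --context-parallel-size $CP --context-parallel-algo megatron_cp_algo --cp-attention-mask-type causal \
    --tokenizer-type PretrainedFromHF --tokenizer-name-or-path $TOKENIZER_PATH \
    --seq-length 131072 --max-position-embeddings 131072 --micro-batch-size 1 --global-batch-size 64 --bf16"

DATA_ARGS="--data-path $DATA_PATH --split 100,0,0 --data-seq-length 131072 --num-workers 8"
CKPT_ARGS="--load $CKPT_LOAD_DIR --vit-load / --no-load-optim --no-load-rng --seed 424242 --save $CKPT_SAVE_DIR"
OUTPUT_ARGS="--log-interval 1 --save-interval 100 --eval-interval 100 --eval-iters 0 \
    --log-throughput --distributed-timeout-minutes 120"

torchrun $DISTRIBUTED_ARGS /local_disk/cognitron_vl//lcvlm_modellink/pretrain_lcvlm.py \
    $GPT_ARGS $DATA_ARGS $CKPT_ARGS $OUTPUT_ARGS --distributed-backend nccl

With GPUS_PER_NODE=16 and NNODES=32 this launches 512 ranks in total; TP=8 x CP=2 (PP=1) means each model replica spans 16 NPUs, giving a data-parallel size of 32, so the global batch of 64 with micro batch 1 implies 2 gradient-accumulation steps per iteration.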
+[2024-11-27 12:44:15,231] torch.distributed.run: [WARNING] ***************************************** +/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/contrib/transfer_to_npu.py:299: ImportWarning: + ************************************************************************************************************* + The torch.Tensor.cuda and torch.nn.Module.cuda are replaced with torch.Tensor.npu and torch.nn.Module.npu now.. + The torch.cuda.DoubleTensor is replaced with torch.npu.FloatTensor cause the double type is not supported now.. + The backend in torch.distributed.init_process_group set to hccl now.. + The torch.cuda.* and torch.cuda.amp.* are replaced with torch.npu.* and torch.npu.amp.* now.. + The device parameters have been replaced with npu in the function below: + torch.logspace, torch.randint, torch.hann_window, torch.rand, torch.full_like, torch.ones_like, torch.rand_like, torch.randperm, torch.arange, torch.frombuffer, torch.normal, torch._empty_per_channel_affine_quantized, torch.empty_strided, torch.empty_like, torch.scalar_tensor, torch.tril_indices, torch.bartlett_window, torch.ones, torch.sparse_coo_tensor, torch.randn, torch.kaiser_window, torch.tensor, torch.triu_indices, torch.as_tensor, torch.zeros, torch.randint_like, torch.full, torch.eye, torch._sparse_csr_tensor_unsafe, torch.empty, torch._sparse_coo_tensor_unsafe, torch.blackman_window, torch.zeros_like, torch.range, torch.sparse_csr_tensor, torch.randn_like, torch.from_file, torch._cudnn_init_dropout_state, torch._empty_affine_quantized, torch.linspace, torch.hamming_window, torch.empty_quantized, torch._pin_memory, torch.autocast, torch.load, torch.Generator, torch.set_default_device, torch.Tensor.new_empty, torch.Tensor.new_empty_strided, torch.Tensor.new_full, torch.Tensor.new_ones, torch.Tensor.new_tensor, torch.Tensor.new_zeros, torch.Tensor.to, torch.nn.Module.to, torch.nn.Module.to_empty + ************************************************************************************************************* + + warnings.warn(msg, ImportWarning) +/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/contrib/transfer_to_npu.py:260: RuntimeWarning: torch.jit.script and torch.jit.script_method will be disabled by transfer_to_npu, which currently does not support them, if you need to enable them, please do not use transfer_to_npu. + warnings.warn(msg, RuntimeWarning) +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Creating extension directory /root/.cache/torch_extensions/py38_cpu/adaptive_cp... +Emitting ninja build file /root/.cache/torch_extensions/py38_cpu/adaptive_cp/build.ninja... +Building extension module adaptive_cp... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... 
+Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +[1/2] c++ -MMD -MF adaptive_cp.o.d -DTORCH_EXTENSION_NAME=adaptive_cp -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/usr/local/Ascend/ascend-toolkit/latest/include -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/include -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/third_party -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/acl -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/inc -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include/TH -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include/THC -isystem /root/miniconda3/envs/py38/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -fstack-protector-all -Wl,-z,relro,-z,now,-z,noexecstack -fPIC -pie -Wl,--disable-new-dtags,--rpath -s -O2 -c local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/ops/csrc/algorithm/adaptive_cp/adaptive_cp.cpp -o adaptive_cp.o +[2/2] c++ adaptive_cp.o -shared -L/usr/local/Ascend/ascend-toolkit/latest/lib64 -lascendcl -L/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/lib -ltorch_npu -L/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/lib -lc10 -ltorch_cpu -ltorch -ltorch_python -o adaptive_cp.so +Loading extension module adaptive_cp... +local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 + warnings.warn("failed to generate the npu_matmul_add_fp32") +Loading extension module adaptive_cp... +local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 + warnings.warn("failed to generate the npu_matmul_add_fp32") +Loading extension module adaptive_cp... +Loading extension module adaptive_cp... +local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 + warnings.warn("failed to generate the npu_matmul_add_fp32") +Loading extension module adaptive_cp... +local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 + warnings.warn("failed to generate the npu_matmul_add_fp32") +Loading extension module adaptive_cp... +Loading extension module adaptive_cp... 
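The adaptive_cp messages above come from PyTorch's JIT C++ extension build: ninja compiles the MindSpeed source listed in the [1/2] command into adaptive_cp.so under the per-environment extensions root, and the other local ranks then just load the cached module, which is why "Using ... as PyTorch extensions root" and "Loading extension module adaptive_cp..." repeat once per process. A small illustrative bash sketch, assuming the cache path recorded in this log:

# Inspect the JIT-built extension cache created during this run (path taken from the log above);
# per the build lines it should contain build.ninja, adaptive_cp.o and adaptive_cp.so.
ls /root/.cache/torch_extensions/py38_cpu/adaptive_cp/

# Hypothetical maintenance step (not part of the traced script): removing the cache
# directory would force a recompile on the next launch.
# rm -rf /root/.cache/torch_extensions/py38_cpu/adaptive_cp/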
+/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
+ warn( +/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? + warn( +using world size: 512, data-parallel size: 32, context-parallel size: 2 tensor-model-parallel size: 8, pipeline-model-parallel size: 1 +WARNING: Setting args.overlap_p2p_comm to False since non-interleaved schedule does not support overlapping p2p communication +accumulate and all-reduce gradients in fp32 for bfloat16 data type. +using torch.bfloat16 for parameters ... +[INFO] Setting args.use_flash_attn=True since context parallel is enabled. +[INFO] Setting args.create_attention_mask_in_dataloader to False since reset_data=False or alibi_without_flash_attn=False or args.tokenizer_padding_side=right +------------------------ ModelLink Arguments ------------------------ + accumulate_allreduce_grads_in_fp32 .............. True + adam_beta1 ...................................... 0.9 + adam_beta2 ...................................... 0.999 + adam_eps ........................................ 1e-08 + adaptive_cp_dynamic_attn_mask ................... False + adaptive_cp_manually_set_mask_list .............. False + adaptive_cp_only_reschedule ..................... False + adaptive_cp_without_coarse ...................... False + adaptive_recompute_device_size .................. -1 + adaptive_recompute_device_swap .................. False + adaptive_recompute_profiling_step ............... 10 + add_bias_linear ................................. False + add_class_token ................................. True + add_dense_bias .................................. False + add_position_embedding .......................... True + add_qkv_bias .................................... True + add_rmsnorm_offset .............................. False + adlr_autoresume ................................. False + adlr_autoresume_interval ........................ 1000 + apply_layernorm_1p .............................. False + apply_query_key_layer_scaling ................... False + apply_residual_connection_post_layernorm ........ False + apply_rope_fusion ............................... True + async_tensor_model_parallel_allreduce ........... False + attention_dropout ............................... 0.0 + attention_mask_on_cpu ........................... False + attention_softmax_in_fp32 ....................... True + attn_logit_softcapping .......................... None + auto_detect_ckpt_format ......................... False + barrier_with_L1_time ............................ True + bert_binary_head ................................ True + bert_embedder_type .............................. megatron + bert_load ....................................... None + bf16 ............................................ True + bias_dropout_fusion ............................. True + bias_gelu_fusion ................................ False + bias_swiglu_fusion .............................. True + biencoder_projection_dim ........................ 0 + biencoder_shared_query_context_model ............ False + block_data_path ................................. None + check_for_nan_in_loss_and_grad .................. 
True + chunk_size ...................................... 4096 + ckpt_fully_parallel_save ........................ False + ckpt_step ....................................... None + classes_fraction ................................ 1.0 + clip_grad ....................................... 1.0 + clone_scatter_output_in_embedding ............... True + consumed_train_samples .......................... 0 + consumed_valid_samples .......................... 0 + context_parallel_algo ........................... megatron_cp_algo + context_parallel_size ........................... 2 + cp_attention_mask_type .......................... causal + cp_window_size .................................. 1 + create_attention_mask_in_dataloader ............. False + cross_dataset_joint ............................. True + data_cache_path ................................. None + data_parallel_random_init ....................... False + data_parallel_size .............................. 32 + data_path ....................................... ['/local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml'] + data_per_class_fraction ......................... 1.0 + data_seq_length ................................. 131072 + data_sharding ................................... True + dataloader_type ................................. single + decoder_num_layers .............................. None + decoder_seq_length .............................. None + decoupled_lr .................................... None + decoupled_min_lr ................................ None + delay_grad_reduce ............................... True + delay_param_gather .............................. False + dim_model_base .................................. None + dino_bottleneck_size ............................ 256 + dino_freeze_last_layer .......................... 1 + dino_head_hidden_size ........................... 2048 + dino_local_crops_number ......................... 10 + dino_local_img_size ............................. 96 + dino_norm_last_layer ............................ False + dino_teacher_temp ............................... 0.07 + dino_warmup_teacher_temp ........................ 0.04 + dino_warmup_teacher_temp_epochs ................. 30 + dist_ckpt_format ................................ torch_dist + distribute_saved_activations .................... False + distributed_backend ............................. nccl + distributed_timeout_minutes ..................... 120 + dpo_beta ........................................ 0.1 + dpo_ftx ......................................... 0.0 + dpo_label_smoothing ............................. 0.0 + dpo_loss_type ................................... sigmoid + embed_layernorm ................................. False + embedding_multiplier_scale ...................... 1.0 + embedding_path .................................. None + empty_unused_memory_level ....................... 0 + enable_chunk_memory ............................. False + enable_chunk_sequence ........................... False + enable_hbmfault_repair .......................... False + enable_high_availability ........................ False + enable_one_logger ............................... False + enable_optimizer_state_local_copy ............... False + enable_recompute_layers_per_pp_rank ............. False + enable_token_rearrange_opt ...................... False + encoder_num_layers .............................. 48 + encoder_seq_length .............................. 131072 + end_weight_decay ................................ 
0.0 + eod_mask_loss ................................... False + eval_interval ................................... 100 + eval_iters ...................................... 0 + evidence_data_path .............................. None + exit_duration_in_mins ........................... None + exit_interval ................................... None + exit_on_missing_checkpoint ...................... False + exit_signal_handler ............................. False + expert_interval ................................. 1 + expert_model_parallel_size ...................... 1 + ffn_hidden_size ................................. 13824 + fill_neg_inf .................................... False + finetune ........................................ True + first_k_dense_replace ........................... None + first_pipeline_num_layers ....................... 0 + fp16 ............................................ False + fp16_lm_cross_entropy ........................... False + fp32_residual_connection ........................ False + fp8 ............................................. None + fp8_amax_compute_algo ........................... most_recent + fp8_amax_history_len ............................ 1 + fp8_interval .................................... 1 + fp8_margin ...................................... 0 + fp8_wgrad ....................................... True + full_shuffle_instruction_dataset ................ False + geglu ........................................... False + gelu_tanh ....................................... False + global_batch_size ............................... 64 + gradient_accumulation_fusion .................... False + group_query_attention ........................... True + head_lr_mult .................................... 1.0 + hidden_dropout .................................. 0.0 + hidden_size ..................................... 5120 + high_freq_factor ................................ None + hysteresis ...................................... 2 + ict_head_size ................................... None + ict_load ........................................ None + image_size ...................................... 448 + image_token_length .............................. 256 + img_h ........................................... 224 + img_w ........................................... 224 + independent_parallel ............................ False + indexer_batch_size .............................. 128 + indexer_log_interval ............................ 1000 + inference_batch_times_seqlen_threshold .......... 512 + init_method_std ................................. 0.01 + init_method_xavier_uniform ...................... False + initial_loss_scale .............................. 4096.0 + input_embeds_norm ............................... False + input_jitter .................................... True + input_layernorm_in_fp32 ......................... False + interleave_sliding_window ....................... None + is_instruction_dataset .......................... True + is_pairwise_dataset ............................. False + iter_per_epoch .................................. 1250 + jit_compile ..................................... False + kv_channels ..................................... 128 + kv_head_repeat_before_uly_alltoall .............. True + kv_lora_rank .................................... None + language_model_freeze ........................... False + lazy_mpu_init ................................... None + load ............................................ 
/data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/ + load_checkpoint_loosely ......................... False + local_rank ...................................... None + log_batch_size_to_tensorboard ................... False + log_interval .................................... 1 + log_learning_rate_to_tensorboard ................ True + log_loss_scale_to_tensorboard ................... True + log_memory_to_tensorboard ....................... False + log_num_zeros_in_grad ........................... False + log_params_norm ................................. False + log_progress .................................... False + log_throughput .................................. True + log_timers_to_tensorboard ....................... False + log_validation_ppl_to_tensorboard ............... False + log_world_size_to_tensorboard ................... False + logit_mask ...................................... False + lora_alpha ...................................... 32 + lora_fusion ..................................... False + lora_load ....................................... None + lora_modules_to_save ............................ None + lora_r .......................................... 16 + lora_register_forward_hook ...................... ['word_embeddings', 'input_layernorm'] + lora_target_modules ............................. [] + loss_scale ...................................... None + loss_scale_window ............................... 1000 + low_freq_factor ................................. None + lr .............................................. 5e-06 + lr_decay_iters .................................. None + lr_decay_samples ................................ None + lr_decay_style .................................. cosine + lr_warmup_fraction .............................. 0.03 + lr_warmup_init .................................. 0.0 + lr_warmup_iters ................................. 0 + lr_warmup_samples ............................... 0 + make_vocab_size_divisible_by .................... 1 + manual_gc ....................................... False + manual_gc_eval .................................. True + manual_gc_interval .............................. 0 + mask_factor ..................................... 1.0 + mask_prob ....................................... 0.15 + mask_type ....................................... random + masked_softmax_fusion ........................... False + max_fps ......................................... 1 + max_num_frame ................................... 512 + max_num_image ................................... 8 + max_patch_grid .................................. 12 + max_position_embeddings ......................... 131072 + max_tokens_to_oom ............................... 12000 + merge_file ...................................... None + micro_batch_size ................................ 1 + min_loss_scale .................................. 1.0 + min_lr .......................................... 1e-07 + min_patch_grid .................................. 1 + mmap_bin_files .................................. True + mock_data ....................................... False + moe_allgather_overlap_comm ...................... False + moe_alltoall_overlap_comm ....................... False + moe_aux_loss_coeff .............................. 0.0 + moe_comm_aux_loss_coeff ......................... 0.0 + moe_device_level_aux_loss_coeff ................. 0.0 + moe_expert_capacity_factor ...................... 
None + moe_grouped_gemm ................................ False + moe_input_jitter_eps ............................ None + moe_intermediate_size ........................... None + moe_layer_freq .................................. None + moe_pad_expert_input_to_capacity ................ False + moe_per_layer_logging ........................... False + moe_permutation_async_comm ...................... False + moe_router_load_balancing_type .................. aux_loss + moe_router_topk ................................. 2 + moe_token_dispatcher_type ....................... allgather + moe_token_drop_policy ........................... probs + moe_token_dropping .............................. False + moe_tp_extend_ep ................................ False + moe_train_capacity_factor ....................... 1.0 + moe_without_activation .......................... False + moe_z_loss_coeff ................................ 0.0 + moe_zero_memory ................................. disable + multi_head_latent_attention ..................... False + n_shared_experts ................................ None + nccl_communicator_config_path ................... None + next_tockens .................................... 0 + no_load_optim ................................... True + no_load_rng ..................................... True + no_persist_layer_norm ........................... False + no_post_layer_norm .............................. False + no_save_optim ................................... None + no_save_rng ..................................... None + no_shared_storage ............................... False + no_shuffle ...................................... False + noisy_gate_policy ............................... None + noop_layers ..................................... None + norm_epsilon .................................... 1e-06 + norm_topk_prob .................................. False + normalization ................................... RMSNorm + num_attention_heads ............................. 40 + num_channels .................................... 3 + num_classes ..................................... 1000 + num_experts ..................................... None + num_layer_list .................................. None + num_layers ...................................... 48 + num_layers_per_virtual_pipeline_stage ........... None + num_query_groups ................................ 8 + num_workers ..................................... 8 + one_logger_entity ............................... hwinf_dcm + one_logger_project .............................. e2e-tracking + one_logger_run_name ............................. None + onnx_safe ....................................... None + openai_gelu ..................................... False + optimizer ....................................... adam + original_max_position_embeddings ................ None + output_bert_embeddings .......................... False + output_layer_slice_num .......................... 1 + output_logit_softcapping ........................ None + output_multiplier_scale ......................... None + overlap_grad_reduce ............................. True + overlap_p2p_comm ................................ False + overlap_param_gather ............................ False + override_opt_param_scheduler .................... False + pad_to_multiple_of .............................. 8 + padded_vocab_size ............................... 152064 + params_dtype .................................... 
torch.bfloat16 + patch_dim ....................................... 16 + perform_initialization .......................... True + pipeline_model_parallel_size .................... 1 + pipeline_model_parallel_split_rank .............. None + position_embedding_type ......................... rope + post_norm ....................................... False + pre_tockens ..................................... 65536 + pref_ftx ........................................ 0.0 + pretrained_checkpoint ........................... None + profile ......................................... False + profile_level ................................... level0 + profile_ranks ................................... [-1] + profile_record_shapes ........................... False + profile_save_path ............................... ./profile_dir + profile_step_end ................................ 12 + profile_step_start .............................. 10 + profile_with_cpu ................................ False + profile_with_memory ............................. False + profile_with_stack .............................. False + prompt_format ................................... qwen2 + prompt_type ..................................... None + q_lora_rank ..................................... None + qk_layernorm .................................... False + qk_nope_head_dim ................................ None + qk_rope_head_dim ................................ None + query_in_block_prob ............................. 0.1 + query_pre_attn_scalar ........................... None + rampup_batch_size ............................... None + rank ............................................ 0 + recompute_activation_function ................... False + recompute_activation_function_num_layers ........ None + recompute_granularity ........................... full + recompute_in_advance ............................ False + recompute_in_bubble ............................. False + recompute_method ................................ block + recompute_num_layers ............................ 48 + reduce_recompute_for_last_chunk ................. False + ref_model ....................................... None + reset_attention_mask ............................ False + reset_position_ids .............................. False + retriever_report_topk_accuracies ................ [] + retriever_score_scaling ......................... False + retriever_seq_length ............................ 256 + retro_add_retriever ............................. False + retro_attention_gate ............................ 1 + retro_cyclic_train_iters ........................ None + retro_encoder_attention_dropout ................. 0.1 + retro_encoder_hidden_dropout .................... 0.1 + retro_encoder_layers ............................ 2 + retro_num_neighbors ............................. 2 + retro_num_retrieved_chunks ...................... 2 + retro_project_dir ............................... None + retro_verify_neighbor_count ..................... True + reuse_fp32_param ................................ False + rope_scaling_beta_fast .......................... 32 + rope_scaling_beta_slow .......................... 1 + rope_scaling_factor ............................. 1.0 + rope_scaling_mscale ............................. 1.0 + rope_scaling_mscale_all_dim ..................... 0.0 + rope_scaling_original_max_position_embeddings ... None + rope_scaling_type ............................... None + rotary_base ..................................... 
1000000.0 + rotary_interleaved .............................. False + rotary_percent .................................. 1.0 + rotary_seq_len_interpolation_factor ............. None + routed_scaling_factor ........................... None + sample_rate ..................................... 1.0 + save ............................................ /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213// + save_interval ................................... 100 + scale_depth ..................................... None + scale_emb ....................................... None + scatter_gather_tensors_in_pipeline .............. True + seed ............................................ 424242 + seq_aux ......................................... False + seq_length ...................................... 131072 + sequence_parallel ............................... True + sgd_momentum .................................... 0.9 + shape_order ..................................... SBH + shared_expert_gate .............................. False + shared_expert_gate_output_dimension ............. 1 + short_seq_prob .................................. 0.1 + skip_bias_add ................................... True + skip_train ...................................... False + sliding_window .................................. None + sparse_mode ..................................... 4 + spec ............................................ None + split ........................................... 100,0,0 + square_alibi_mask ............................... False + squared_relu .................................... False + stage ........................................... None + standalone_embedding_stage ...................... False + start_weight_decay .............................. 0.0 + swap_attention .................................. False + swap_modules .................................... None + swiglu .......................................... True + swin_backbone_type .............................. tiny + tensor_model_parallel_size ...................... 8 + tensorboard_dir ................................. None + tensorboard_log_interval ........................ 1 + tensorboard_queue_size .......................... 1000 + test_data_path .................................. None + test_mode ....................................... False + timing_log_level ................................ 0 + timing_log_option ............................... minmax + titles_data_path ................................ None + tokenizer_kwargs ................................ None + tokenizer_model ................................. None + tokenizer_name_or_path .......................... /data_4/models/Qwen/Qwen2.5-14B-Instruct/ + tokenizer_not_use_fast .......................... True + tokenizer_padding_side .......................... right + tokenizer_type .................................. PretrainedFromHF + topk_group ...................................... None + tp_2d ........................................... False + tp_comm_bulk_dgrad .............................. True + tp_comm_bulk_wgrad .............................. True + tp_comm_overlap ................................. False + tp_comm_overlap_ag .............................. True + tp_comm_overlap_cfg ............................. None + tp_comm_overlap_rs .............................. True + tp_comm_split_ag ................................ True + tp_comm_split_rs ................................ 
True + tp_x ............................................ 1 + tp_y ............................................ 1 + train_data_path ................................. None + train_iters ..................................... 1000 + train_samples ................................... None + transformer_impl ................................ local + transformer_pipeline_model_parallel_size ........ 1 + ulysses_degree_in_cp ............................ None + untie_embeddings_and_output_weights ............. True + use_checkpoint_args ............................. False + use_checkpoint_opt_param_scheduler .............. False + use_cp_send_recv_overlap ........................ True + use_cpu_initialization .......................... None + use_deter_comp .................................. False + use_dist_ckpt ................................... False + use_distributed_optimizer ....................... True + use_flash_attn .................................. True + use_fused_moe_token_permute_and_unpermute ....... False + use_fused_ring_attention_update ................. False + use_fused_rmsnorm ............................... True + use_fused_rotary_pos_emb ........................ True + use_fused_swiglu ................................ True + use_glm_rope .................................... False + use_mc2 ......................................... False + use_mcore_models ................................ True + use_one_sent_docs ............................... False + use_ring_exchange_p2p ........................... False + use_rotary_position_embeddings .................. True + v_head_dim ...................................... None + valid_data_path ................................. None + variable_seq_lengths ............................ False + virtual_pipeline_model_parallel_size ............ None + vision_backbone_type ............................ vit + vision_context_parallel ......................... False + vision_downsample_ratio ......................... 0.5 + vision_downsample_stride ........................ 1.0 + vision_model_freeze ............................. True + vision_model_lr_decay_rate ...................... 1.0 + vision_model_lr_mult ............................ 1.0 + vision_model_recompute .......................... False + vision_model_type ............................... intern_300m + vision_normalize_type ........................... imagenet + vision_pretraining .............................. False + vision_pretraining_type ......................... classify + vision_process_type ............................. dynamic + vision_projector_freeze ......................... False + vision_projector_pre_norm ....................... True + vision_projector_recompute ...................... False + vision_projector_type ........................... mlp + vision_seq_length ............................... 1025 + vit_load ........................................ / + vocab_extra_ids ................................. 0 + vocab_file ...................................... None + vocab_size ...................................... None + wandb_exp_name .................................. + wandb_project ................................... + wandb_save_dir .................................. + weight_decay .................................... 0.0 + weight_decay_incr_style ......................... constant + world_size ...................................... 512 + yaml_cfg ........................................ 
None +-------------------- end of ModelLink Arguments --------------------- +setting number of micro-batches to constant 2 + > building PretrainFromHF tokenizer. Vocab file is un-used, loading tokenizer from pre-trained model +> initializing torch distributed ... +[W ProcessGroupHCCL.cpp:678] Warning: The HCCL execution timeout 7200000ms is bigger than watchdog timeout 1800000ms which is set by init_process_group! The plog may not be recorded. (function ProcessGroupHCCL) +[W ProcessGroupHCCL.cpp:678] Warning: The HCCL execution timeout 7200000ms is bigger than watchdog timeout 1800000ms which is set by init_process_group! The plog may not be recorded. (function ProcessGroupHCCL) +[W ProcessGroupHCCL.cpp:678] Warning: The HCCL execution timeout 7200000ms is bigger than watchdog timeout 1800000ms which is set by init_process_group! The plog may not be recorded. (function ProcessGroupHCCL) +[W ProcessGroupHCCL.cpp:678] Warning: The HCCL execution timeout 7200000ms is bigger than watchdog timeout 1800000ms which is set by init_process_group! The plog may not be recorded. (function ProcessGroupHCCL) +[W ProcessGroupHCCL.cpp:678] Warning: The HCCL execution timeout 7200000ms is bigger than watchdog timeout 1800000ms which is set by init_process_group! The plog may not be recorded. (function ProcessGroupHCCL) +all tp groups [[0, 1, 2, 3, 4, 5, 6, 7], [8, 9, 10, 11, 12, 13, 14, 15], [16, 17, 18, 19, 20, 21, 22, 23], [24, 25, 26, 27, 28, 29, 30, 31], [32, 33, 34, 35, 36, 37, 38, 39], [40, 41, 42, 43, 44, 45, 46, 47], [48, 49, 50, 51, 52, 53, 54, 55], [56, 57, 58, 59, 60, 61, 62, 63], [64, 65, 66, 67, 68, 69, 70, 71], [72, 73, 74, 75, 76, 77, 78, 79], [80, 81, 82, 83, 84, 85, 86, 87], [88, 89, 90, 91, 92, 93, 94, 95], [96, 97, 98, 99, 100, 101, 102, 103], [104, 105, 106, 107, 108, 109, 110, 111], [112, 113, 114, 115, 116, 117, 118, 119], [120, 121, 122, 123, 124, 125, 126, 127], [128, 129, 130, 131, 132, 133, 134, 135], [136, 137, 138, 139, 140, 141, 142, 143], [144, 145, 146, 147, 148, 149, 150, 151], [152, 153, 154, 155, 156, 157, 158, 159], [160, 161, 162, 163, 164, 165, 166, 167], [168, 169, 170, 171, 172, 173, 174, 175], [176, 177, 178, 179, 180, 181, 182, 183], [184, 185, 186, 187, 188, 189, 190, 191], [192, 193, 194, 195, 196, 197, 198, 199], [200, 201, 202, 203, 204, 205, 206, 207], [208, 209, 210, 211, 212, 213, 214, 215], [216, 217, 218, 219, 220, 221, 222, 223], [224, 225, 226, 227, 228, 229, 230, 231], [232, 233, 234, 235, 236, 237, 238, 239], [240, 241, 242, 243, 244, 245, 246, 247], [248, 249, 250, 251, 252, 253, 254, 255], [256, 257, 258, 259, 260, 261, 262, 263], [264, 265, 266, 267, 268, 269, 270, 271], [272, 273, 274, 275, 276, 277, 278, 279], [280, 281, 282, 283, 284, 285, 286, 287], [288, 289, 290, 291, 292, 293, 294, 295], [296, 297, 298, 299, 300, 301, 302, 303], [304, 305, 306, 307, 308, 309, 310, 311], [312, 313, 314, 315, 316, 317, 318, 319], [320, 321, 322, 323, 324, 325, 326, 327], [328, 329, 330, 331, 332, 333, 334, 335], [336, 337, 338, 339, 340, 341, 342, 343], [344, 345, 346, 347, 348, 349, 350, 351], [352, 353, 354, 355, 356, 357, 358, 359], [360, 361, 362, 363, 364, 365, 366, 367], [368, 369, 370, 371, 372, 373, 374, 375], [376, 377, 378, 379, 380, 381, 382, 383], [384, 385, 386, 387, 388, 389, 390, 391], [392, 393, 394, 395, 396, 397, 398, 399], [400, 401, 402, 403, 404, 405, 406, 407], [408, 409, 410, 411, 412, 413, 414, 415], [416, 417, 418, 419, 420, 421, 422, 423], [424, 425, 426, 427, 428, 429, 430, 431], [432, 433, 434, 435, 436, 437, 438, 439], [440, 441, 
442, 443, 444, 445, 446, 447], [448, 449, 450, 451, 452, 453, 454, 455], [456, 457, 458, 459, 460, 461, 462, 463], [464, 465, 466, 467, 468, 469, 470, 471], [472, 473, 474, 475, 476, 477, 478, 479], [480, 481, 482, 483, 484, 485, 486, 487], [488, 489, 490, 491, 492, 493, 494, 495], [496, 497, 498, 499, 500, 501, 502, 503], [504, 505, 506, 507, 508, 509, 510, 511]] +all ep groups [[0], [8], [16], [24], [32], [40], [48], [56], [64], [72], [80], [88], [96], [104], [112], [120], [128], [136], [144], [152], [160], [168], [176], [184], [192], [200], [208], [216], [224], [232], [240], [248], [256], [264], [272], [280], [288], [296], [304], [312], [320], [328], [336], [344], [352], [360], [368], [376], [384], [392], [400], [408], [416], [424], [432], [440], [448], [456], [464], [472], [480], [488], [496], [504], [1], [9], [17], [25], [33], [41], [49], [57], [65], [73], [81], [89], [97], [105], [113], [121], [129], [137], [145], [153], [161], [169], [177], [185], [193], [201], [209], [217], [225], [233], [241], [249], [257], [265], [273], [281], [289], [297], [305], [313], [321], [329], [337], [345], [353], [361], [369], [377], [385], [393], [401], [409], [417], [425], [433], [441], [449], [457], [465], [473], [481], [489], [497], [505], [2], [10], [18], [26], [34], [42], [50], [58], [66], [74], [82], [90], [98], [106], [114], [122], [130], [138], [146], [154], [162], [170], [178], [186], [194], [202], [210], [218], [226], [234], [242], [250], [258], [266], [274], [282], [290], [298], [306], [314], [322], [330], [338], [346], [354], [362], [370], [378], [386], [394], [402], [410], [418], [426], [434], [442], [450], [458], [466], [474], [482], [490], [498], [506], [3], [11], [19], [27], [35], [43], [51], [59], [67], [75], [83], [91], [99], [107], [115], [123], [131], [139], [147], [155], [163], [171], [179], [187], [195], [203], [211], [219], [227], [235], [243], [251], [259], [267], [275], [283], [291], [299], [307], [315], [323], [331], [339], [347], [355], [363], [371], [379], [387], [395], [403], [411], [419], [427], [435], [443], [451], [459], [467], [475], [483], [491], [499], [507], [4], [12], [20], [28], [36], [44], [52], [60], [68], [76], [84], [92], [100], [108], [116], [124], [132], [140], [148], [156], [164], [172], [180], [188], [196], [204], [212], [220], [228], [236], [244], [252], [260], [268], [276], [284], [292], [300], [308], [316], [324], [332], [340], [348], [356], [364], [372], [380], [388], [396], [404], [412], [420], [428], [436], [444], [452], [460], [468], [476], [484], [492], [500], [508], [5], [13], [21], [29], [37], [45], [53], [61], [69], [77], [85], [93], [101], [109], [117], [125], [133], [141], [149], [157], [165], [173], [181], [189], [197], [205], [213], [221], [229], [237], [245], [253], [261], [269], [277], [285], [293], [301], [309], [317], [325], [333], [341], [349], [357], [365], [373], [381], [389], [397], [405], [413], [421], [429], [437], [445], [453], [461], [469], [477], [485], [493], [501], [509], [6], [14], [22], [30], [38], [46], [54], [62], [70], [78], [86], [94], [102], [110], [118], [126], [134], [142], [150], [158], [166], [174], [182], [190], [198], [206], [214], [222], [230], [238], [246], [254], [262], [270], [278], [286], [294], [302], [310], [318], [326], [334], [342], [350], [358], [366], [374], [382], [390], [398], [406], [414], [422], [430], [438], [446], [454], [462], [470], [478], [486], [494], [502], [510], [7], [15], [23], [31], [39], [47], [55], [63], [71], [79], [87], [95], [103], [111], [119], [127], [135], [143], [151], [159], 
[167], [175], [183], [191], [199], [207], [215], [223], [231], [239], [247], [255], [263], [271], [279], [287], [295], [303], [311], [319], [327], [335], [343], [351], [359], [367], [375], [383], [391], [399], [407], [415], [423], [431], [439], [447], [455], [463], [471], [479], [487], [495], [503], [511]] +all dp groups [[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496], [1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 401, 417, 433, 449, 465, 481, 497], [2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 402, 418, 434, 450, 466, 482, 498], [3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371, 387, 403, 419, 435, 451, 467, 483, 499], [4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340, 356, 372, 388, 404, 420, 436, 452, 468, 484, 500], [5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309, 325, 341, 357, 373, 389, 405, 421, 437, 453, 469, 485, 501], [6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278, 294, 310, 326, 342, 358, 374, 390, 406, 422, 438, 454, 470, 486, 502], [7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, 263, 279, 295, 311, 327, 343, 359, 375, 391, 407, 423, 439, 455, 471, 487, 503], [8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248, 264, 280, 296, 312, 328, 344, 360, 376, 392, 408, 424, 440, 456, 472, 488, 504], [9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, 377, 393, 409, 425, 441, 457, 473, 489, 505], [10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, 394, 410, 426, 442, 458, 474, 490, 506], [11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363, 379, 395, 411, 427, 443, 459, 475, 491, 507], [12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284, 300, 316, 332, 348, 364, 380, 396, 412, 428, 444, 460, 476, 492, 508], [13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269, 285, 301, 317, 333, 349, 365, 381, 397, 413, 429, 445, 461, 477, 493, 509], [14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 270, 286, 302, 318, 334, 350, 366, 382, 398, 414, 430, 446, 462, 478, 494, 510], [15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 415, 431, 447, 463, 479, 495, 511]] +all_dp_modulo_exp_group_ranks [[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 392, 400, 408, 416, 424, 432, 440, 448, 456, 464, 472, 480, 488, 496, 504], [1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249, 257, 265, 273, 281, 289, 297, 305, 313, 321, 329, 337, 345, 353, 361, 369, 377, 385, 393, 401, 409, 417, 425, 433, 441, 449, 457, 465, 473, 481, 489, 497, 505], [2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 
186, 194, 202, 210, 218, 226, 234, 242, 250, 258, 266, 274, 282, 290, 298, 306, 314, 322, 330, 338, 346, 354, 362, 370, 378, 386, 394, 402, 410, 418, 426, 434, 442, 450, 458, 466, 474, 482, 490, 498, 506], [3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187, 195, 203, 211, 219, 227, 235, 243, 251, 259, 267, 275, 283, 291, 299, 307, 315, 323, 331, 339, 347, 355, 363, 371, 379, 387, 395, 403, 411, 419, 427, 435, 443, 451, 459, 467, 475, 483, 491, 499, 507], [4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172, 180, 188, 196, 204, 212, 220, 228, 236, 244, 252, 260, 268, 276, 284, 292, 300, 308, 316, 324, 332, 340, 348, 356, 364, 372, 380, 388, 396, 404, 412, 420, 428, 436, 444, 452, 460, 468, 476, 484, 492, 500, 508], [5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157, 165, 173, 181, 189, 197, 205, 213, 221, 229, 237, 245, 253, 261, 269, 277, 285, 293, 301, 309, 317, 325, 333, 341, 349, 357, 365, 373, 381, 389, 397, 405, 413, 421, 429, 437, 445, 453, 461, 469, 477, 485, 493, 501, 509], [6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142, 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 238, 246, 254, 262, 270, 278, 286, 294, 302, 310, 318, 326, 334, 342, 350, 358, 366, 374, 382, 390, 398, 406, 414, 422, 430, 438, 446, 454, 462, 470, 478, 486, 494, 502, 510], [7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247, 255, 263, 271, 279, 287, 295, 303, 311, 319, 327, 335, 343, 351, 359, 367, 375, 383, 391, 399, 407, 415, 423, 431, 439, 447, 455, 463, 471, 479, 487, 495, 503, 511]] +all_tensor_and_expert_group_ranks [[0, 1, 2, 3, 4, 5, 6, 7], [8, 9, 10, 11, 12, 13, 14, 15], [16, 17, 18, 19, 20, 21, 22, 23], [24, 25, 26, 27, 28, 29, 30, 31], [32, 33, 34, 35, 36, 37, 38, 39], [40, 41, 42, 43, 44, 45, 46, 47], [48, 49, 50, 51, 52, 53, 54, 55], [56, 57, 58, 59, 60, 61, 62, 63], [64, 65, 66, 67, 68, 69, 70, 71], [72, 73, 74, 75, 76, 77, 78, 79], [80, 81, 82, 83, 84, 85, 86, 87], [88, 89, 90, 91, 92, 93, 94, 95], [96, 97, 98, 99, 100, 101, 102, 103], [104, 105, 106, 107, 108, 109, 110, 111], [112, 113, 114, 115, 116, 117, 118, 119], [120, 121, 122, 123, 124, 125, 126, 127], [128, 129, 130, 131, 132, 133, 134, 135], [136, 137, 138, 139, 140, 141, 142, 143], [144, 145, 146, 147, 148, 149, 150, 151], [152, 153, 154, 155, 156, 157, 158, 159], [160, 161, 162, 163, 164, 165, 166, 167], [168, 169, 170, 171, 172, 173, 174, 175], [176, 177, 178, 179, 180, 181, 182, 183], [184, 185, 186, 187, 188, 189, 190, 191], [192, 193, 194, 195, 196, 197, 198, 199], [200, 201, 202, 203, 204, 205, 206, 207], [208, 209, 210, 211, 212, 213, 214, 215], [216, 217, 218, 219, 220, 221, 222, 223], [224, 225, 226, 227, 228, 229, 230, 231], [232, 233, 234, 235, 236, 237, 238, 239], [240, 241, 242, 243, 244, 245, 246, 247], [248, 249, 250, 251, 252, 253, 254, 255], [256, 257, 258, 259, 260, 261, 262, 263], [264, 265, 266, 267, 268, 269, 270, 271], [272, 273, 274, 275, 276, 277, 278, 279], [280, 281, 282, 283, 284, 285, 286, 287], [288, 289, 290, 291, 292, 293, 294, 295], [296, 297, 298, 299, 300, 301, 302, 303], [304, 305, 306, 307, 308, 309, 310, 311], [312, 313, 314, 315, 316, 317, 318, 319], [320, 321, 322, 323, 324, 325, 326, 327], [328, 329, 330, 331, 332, 333, 334, 335], [336, 337, 338, 339, 340, 341, 342, 343], [344, 345, 346, 347, 348, 349, 350, 351], [352, 353, 354, 355, 356, 357, 
358, 359], [360, 361, 362, 363, 364, 365, 366, 367], [368, 369, 370, 371, 372, 373, 374, 375], [376, 377, 378, 379, 380, 381, 382, 383], [384, 385, 386, 387, 388, 389, 390, 391], [392, 393, 394, 395, 396, 397, 398, 399], [400, 401, 402, 403, 404, 405, 406, 407], [408, 409, 410, 411, 412, 413, 414, 415], [416, 417, 418, 419, 420, 421, 422, 423], [424, 425, 426, 427, 428, 429, 430, 431], [432, 433, 434, 435, 436, 437, 438, 439], [440, 441, 442, 443, 444, 445, 446, 447], [448, 449, 450, 451, 452, 453, 454, 455], [456, 457, 458, 459, 460, 461, 462, 463], [464, 465, 466, 467, 468, 469, 470, 471], [472, 473, 474, 475, 476, 477, 478, 479], [480, 481, 482, 483, 484, 485, 486, 487], [488, 489, 490, 491, 492, 493, 494, 495], [496, 497, 498, 499, 500, 501, 502, 503], [504, 505, 506, 507, 508, 509, 510, 511]] +all_data_parallel_group_ranks_with_cp [[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 392, 400, 408, 416, 424, 432, 440, 448, 456, 464, 472, 480, 488, 496, 504], [1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249, 257, 265, 273, 281, 289, 297, 305, 313, 321, 329, 337, 345, 353, 361, 369, 377, 385, 393, 401, 409, 417, 425, 433, 441, 449, 457, 465, 473, 481, 489, 497, 505], [2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202, 210, 218, 226, 234, 242, 250, 258, 266, 274, 282, 290, 298, 306, 314, 322, 330, 338, 346, 354, 362, 370, 378, 386, 394, 402, 410, 418, 426, 434, 442, 450, 458, 466, 474, 482, 490, 498, 506], [3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187, 195, 203, 211, 219, 227, 235, 243, 251, 259, 267, 275, 283, 291, 299, 307, 315, 323, 331, 339, 347, 355, 363, 371, 379, 387, 395, 403, 411, 419, 427, 435, 443, 451, 459, 467, 475, 483, 491, 499, 507], [4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172, 180, 188, 196, 204, 212, 220, 228, 236, 244, 252, 260, 268, 276, 284, 292, 300, 308, 316, 324, 332, 340, 348, 356, 364, 372, 380, 388, 396, 404, 412, 420, 428, 436, 444, 452, 460, 468, 476, 484, 492, 500, 508], [5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157, 165, 173, 181, 189, 197, 205, 213, 221, 229, 237, 245, 253, 261, 269, 277, 285, 293, 301, 309, 317, 325, 333, 341, 349, 357, 365, 373, 381, 389, 397, 405, 413, 421, 429, 437, 445, 453, 461, 469, 477, 485, 493, 501, 509], [6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142, 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 238, 246, 254, 262, 270, 278, 286, 294, 302, 310, 318, 326, 334, 342, 350, 358, 366, 374, 382, 390, 398, 406, 414, 422, 430, 438, 446, 454, 462, 470, 478, 486, 494, 502, 510], [7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247, 255, 263, 271, 279, 287, 295, 303, 311, 319, 327, 335, 343, 351, 359, 367, 375, 383, 391, 399, 407, 415, 423, 431, 439, 447, 455, 463, 471, 479, 487, 495, 503, 511]] +> initialized tensor model parallel with size 8 +> initialized pipeline model parallel with size 1 +> setting random seeds to 424242 ... +> compiling dataset index builder ... 
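The group listings above follow the usual Megatron-style decomposition of the 512 ranks into tensor-, context-, data- and pipeline-parallel dimensions. A small arithmetic sketch (not the library code itself) that reproduces the sizes and the first groups printed by this run, including the "constant 2" micro-batch count:

    # world size 512 = tp 8 x cp 2 x pp 1 x dp 32, as reported in the argument dump
    world_size, tp, cp, pp = 512, 8, 2, 1
    dp = world_size // (tp * cp * pp)                        # 32
    global_batch, micro_batch = 64, 1
    num_micro_batches = global_batch // (micro_batch * dp)   # 2 -> "constant 2" above

    # Tensor-parallel groups: tp consecutive ranks per group.
    tp_groups = [list(range(i, i + tp)) for i in range(0, world_size, tp)]
    assert tp_groups[0] == [0, 1, 2, 3, 4, 5, 6, 7]

    # Data-parallel groups including context parallelism: same TP index, stride tp.
    dp_cp_groups = [list(range(r, world_size, tp)) for r in range(tp)]
    assert dp_cp_groups[0][:3] == [0, 8, 16] and len(dp_cp_groups[0]) == dp * cp

    # Pure data-parallel groups: stride tp * cp.
    dp_groups = [list(range(r, world_size, tp * cp)) for r in range(tp * cp)]
    assert dp_groups[0][:3] == [0, 16, 32] and len(dp_groups[0]) == dp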
+make: Entering directory 'local_disk/cognitron_vl/third_party/Megatron-LM_core_r0.6.0/megatron/core/datasets' +make: Nothing to be done for 'default'. +make: Leaving directory 'local_disk/cognitron_vl/third_party/Megatron-LM_core_r0.6.0/megatron/core/datasets' +>>> done with dataset index builder. Compilation time: 0.074 seconds +time to initialize megatron (seconds): -34.653 +[after megatron is initialized] datetime: 2024-11-27 12:45:17 +building GPT model ... +Building megatron mcore vision language model ... +vision_projector_recompute False +vision_projector_recompute False +vision_projector_recompute False +vision_projector_recompute False +vision_projector_recompute Falsevision_projector_recompute Falsevision_projector_recompute False + + +vision_projector_recompute False +vision_projector_recompute False +vision_projector_recompute False +vision_projector_recompute False +vision_projector_recompute False +vision_projector_recompute False +model_provider args Namespace(accumulate_allreduce_grads_in_fp32=True, adam_beta1=0.9, adam_beta2=0.999, adam_eps=1e-08, adaptive_cp_dynamic_attn_mask=False, adaptive_cp_manually_set_mask_list=False, adaptive_cp_only_reschedule=False, adaptive_cp_without_coarse=False, adaptive_recompute_device_size=-1, adaptive_recompute_device_swap=False, adaptive_recompute_profiling_step=10, add_bias_linear=False, add_class_token=True, add_dense_bias=False, add_position_embedding=True, add_qkv_bias=True, add_rmsnorm_offset=False, adlr_autoresume=False, adlr_autoresume_interval=1000, apply_layernorm_1p=False, apply_query_key_layer_scaling=False, apply_residual_connection_post_layernorm=False, apply_rope_fusion=True, async_tensor_model_parallel_allreduce=False, attention_dropout=0.0, attention_mask_on_cpu=False, attention_softmax_in_fp32=True, attn_logit_softcapping=None, auto_detect_ckpt_format=False, barrier_with_L1_time=True, bert_binary_head=True, bert_embedder_type='megatron', bert_load=None, bf16=True, bias_dropout_fusion=True, bias_gelu_fusion=False, bias_swiglu_fusion=True, biencoder_projection_dim=0, biencoder_shared_query_context_model=False, block_data_path=None, check_for_nan_in_loss_and_grad=True, chunk_size=4096, ckpt_fully_parallel_save=False, ckpt_step=None, classes_fraction=1.0, clip_grad=1.0, clone_scatter_output_in_embedding=True, consumed_train_samples=0, consumed_valid_samples=0, context_parallel_algo='megatron_cp_algo', context_parallel_size=2, cp_attention_mask_type='causal', cp_window_size=1, create_attention_mask_in_dataloader=False, cross_dataset_joint=True, data_cache_path=None, data_parallel_random_init=False, data_parallel_size=32, data_path=['/local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml'], data_per_class_fraction=1.0, data_seq_length=131072, data_sharding=True, dataloader_type='single', decoder_num_layers=None, decoder_seq_length=None, decoupled_lr=None, decoupled_min_lr=None, delay_grad_reduce=True, delay_param_gather=False, dim_model_base=None, dino_bottleneck_size=256, dino_freeze_last_layer=1, dino_head_hidden_size=2048, dino_local_crops_number=10, dino_local_img_size=96, dino_norm_last_layer=False, dino_teacher_temp=0.07, dino_warmup_teacher_temp=0.04, dino_warmup_teacher_temp_epochs=30, dist_ckpt_format='torch_dist', distribute_saved_activations=False, distributed_backend='nccl', distributed_timeout_minutes=120, dpo_beta=0.1, dpo_ftx=0.0, dpo_label_smoothing=0.0, dpo_loss_type='sigmoid', embed_layernorm=False, embedding_multiplier_scale=1.0, embedding_path=None, empty_unused_memory_level=0, 
enable_chunk_memory=False, enable_chunk_sequence=False, enable_hbmfault_repair=False, enable_high_availability=False, enable_one_logger=False, enable_optimizer_state_local_copy=False, enable_recompute_layers_per_pp_rank=False, enable_token_rearrange_opt=False, encoder_num_layers=48, encoder_seq_length=131072, end_weight_decay=0.0, eod_mask_loss=False, eval_interval=100, eval_iters=0, evidence_data_path=None, exit_duration_in_mins=None, exit_interval=None, exit_on_missing_checkpoint=False, exit_signal_handler=False, expert_interval=1, expert_model_parallel_size=1, ffn_hidden_size=13824, fill_neg_inf=False, finetune=True, first_k_dense_replace=None, first_pipeline_num_layers=0, fp16=False, fp16_lm_cross_entropy=False, fp32_residual_connection=False, fp8=None, fp8_amax_compute_algo='most_recent', fp8_amax_history_len=1, fp8_interval=1, fp8_margin=0, fp8_wgrad=True, full_shuffle_instruction_dataset=False, geglu=False, gelu_tanh=False, global_batch_size=64, gradient_accumulation_fusion=False, group_query_attention=True, head_lr_mult=1.0, hidden_dropout=0.0, hidden_size=5120, high_freq_factor=None, hysteresis=2, ict_head_size=None, ict_load=None, image_size=448, image_token_length=256, img_h=224, img_w=224, independent_parallel=False, indexer_batch_size=128, indexer_log_interval=1000, inference_batch_times_seqlen_threshold=512, init_method_std=0.01, init_method_xavier_uniform=False, initial_loss_scale=4096.0, input_embeds_norm=False, input_jitter=True, input_layernorm_in_fp32=False, interleave_sliding_window=None, is_instruction_dataset=True, is_pairwise_dataset=False, iter_per_epoch=1250, jit_compile=False, kv_channels=128, kv_head_repeat_before_uly_alltoall=True, kv_lora_rank=None, language_model_freeze=False, lazy_mpu_init=None, load='/data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/', load_checkpoint_loosely=False, local_rank=0, log_batch_size_to_tensorboard=False, log_interval=1, log_learning_rate_to_tensorboard=True, log_loss_scale_to_tensorboard=True, log_memory_to_tensorboard=False, log_num_zeros_in_grad=False, log_params_norm=False, log_progress=False, log_throughput=True, log_timers_to_tensorboard=False, log_validation_ppl_to_tensorboard=False, log_world_size_to_tensorboard=False, logit_mask=False, lora_alpha=32, lora_fusion=False, lora_load=None, lora_modules_to_save=None, lora_r=16, lora_register_forward_hook=['word_embeddings', 'input_layernorm'], lora_target_modules=[], loss_scale=None, loss_scale_window=1000, low_freq_factor=None, lr=5e-06, lr_decay_iters=None, lr_decay_samples=None, lr_decay_style='cosine', lr_warmup_fraction=0.03, lr_warmup_init=0.0, lr_warmup_iters=0, lr_warmup_samples=0, make_vocab_size_divisible_by=1, manual_gc=False, manual_gc_eval=True, manual_gc_interval=0, mask_factor=1.0, mask_prob=0.15, mask_type='random', masked_softmax_fusion=False, max_fps=1, max_num_frame=512, max_num_image=8, max_patch_grid=12, max_position_embeddings=131072, max_tokens_to_oom=12000, merge_file=None, micro_batch_size=1, min_loss_scale=1.0, min_lr=1e-07, min_patch_grid=1, mmap_bin_files=True, mock_data=False, model_type=, moe_allgather_overlap_comm=False, moe_alltoall_overlap_comm=False, moe_aux_loss_coeff=0.0, moe_comm_aux_loss_coeff=0.0, moe_device_level_aux_loss_coeff=0.0, moe_expert_capacity_factor=None, moe_grouped_gemm=False, moe_input_jitter_eps=None, moe_intermediate_size=None, moe_layer_freq=None, moe_pad_expert_input_to_capacity=False, moe_per_layer_logging=False, moe_permutation_async_comm=False, 
moe_router_load_balancing_type='aux_loss', moe_router_topk=2, moe_token_dispatcher_type='allgather', moe_token_drop_policy='probs', moe_token_dropping=False, moe_tp_extend_ep=False, moe_train_capacity_factor=1.0, moe_without_activation=False, moe_z_loss_coeff=0.0, moe_zero_memory='disable', multi_head_latent_attention=False, n_shared_experts=None, nccl_communicator_config_path=None, next_tockens=0, no_load_optim=True, no_load_rng=True, no_persist_layer_norm=False, no_post_layer_norm=False, no_save_optim=None, no_save_rng=None, no_shared_storage=False, no_shuffle=False, noisy_gate_policy=None, noop_layers=None, norm_epsilon=1e-06, norm_topk_prob=False, normalization='RMSNorm', num_attention_heads=40, num_channels=3, num_classes=1000, num_experts=None, num_layer_list=None, num_layers=48, num_layers_per_virtual_pipeline_stage=None, num_query_groups=8, num_workers=8, one_logger_entity='hwinf_dcm', one_logger_project='e2e-tracking', one_logger_run_name=None, onnx_safe=None, openai_gelu=False, optimizer='adam', original_max_position_embeddings=None, output_bert_embeddings=False, output_layer_slice_num=1, output_logit_softcapping=None, output_multiplier_scale=None, overlap_grad_reduce=True, overlap_p2p_comm=False, overlap_param_gather=False, override_opt_param_scheduler=False, pad_to_multiple_of=8, padded_vocab_size=152064, params_dtype=torch.bfloat16, patch_dim=16, perform_initialization=True, pipeline_model_parallel_size=1, pipeline_model_parallel_split_rank=None, position_embedding_type='rope', post_norm=False, pre_tockens=65536, pref_ftx=0.0, pretrained_checkpoint=None, profile=False, profile_level='level0', profile_ranks=[-1], profile_record_shapes=False, profile_save_path='./profile_dir', profile_step_end=12, profile_step_start=10, profile_with_cpu=False, profile_with_memory=False, profile_with_stack=False, prompt_format='qwen2', prompt_type=None, q_lora_rank=None, qk_layernorm=False, qk_nope_head_dim=None, qk_rope_head_dim=None, query_in_block_prob=0.1, query_pre_attn_scalar=None, rampup_batch_size=None, rank=0, recompute_activation_function=False, recompute_activation_function_num_layers=None, recompute_granularity='full', recompute_in_advance=False, recompute_in_bubble=False, recompute_method='block', recompute_num_layers=48, reduce_recompute_for_last_chunk=False, ref_model=None, reset_attention_mask=False, reset_position_ids=False, retriever_report_topk_accuracies=[], retriever_score_scaling=False, retriever_seq_length=256, retro_add_retriever=False, retro_attention_gate=1, retro_cyclic_train_iters=None, retro_encoder_attention_dropout=0.1, retro_encoder_hidden_dropout=0.1, retro_encoder_layers=2, retro_num_neighbors=2, retro_num_retrieved_chunks=2, retro_project_dir=None, retro_verify_neighbor_count=True, reuse_fp32_param=False, rope_scaling_beta_fast=32, rope_scaling_beta_slow=1, rope_scaling_factor=1.0, rope_scaling_mscale=1.0, rope_scaling_mscale_all_dim=0.0, rope_scaling_original_max_position_embeddings=None, rope_scaling_type=None, rotary_base=1000000.0, rotary_interleaved=False, rotary_percent=1.0, rotary_seq_len_interpolation_factor=None, routed_scaling_factor=None, sample_rate=1.0, save='/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//', save_interval=100, scale_depth=None, scale_emb=None, scatter_gather_tensors_in_pipeline=True, seed=424242, seq_aux=False, seq_length=131072, sequence_parallel=True, sgd_momentum=0.9, shape_order='SBH', shared_expert_gate=False, shared_expert_gate_output_dimension=1, 
short_seq_prob=0.1, skip_bias_add=True, skip_train=False, sliding_window=None, sparse_mode=4, spec=None, split='100,0,0', square_alibi_mask=False, squared_relu=False, stage=None, standalone_embedding_stage=False, start_weight_decay=0.0, swap_attention=False, swap_modules=None, swiglu=True, swin_backbone_type='tiny', tensor_model_parallel_size=8, tensorboard_dir=None, tensorboard_log_interval=1, tensorboard_queue_size=1000, test_data_path=None, test_mode=False, timing_log_level=0, timing_log_option='minmax', titles_data_path=None, tokenizer_kwargs=None, tokenizer_model=None, tokenizer_name_or_path='/data_4/models/Qwen/Qwen2.5-14B-Instruct/', tokenizer_not_use_fast=True, tokenizer_padding_side='right', tokenizer_type='PretrainedFromHF', topk_group=None, tp_2d=False, tp_comm_bulk_dgrad=True, tp_comm_bulk_wgrad=True, tp_comm_overlap=False, tp_comm_overlap_ag=True, tp_comm_overlap_cfg=None, tp_comm_overlap_rs=True, tp_comm_split_ag=True, tp_comm_split_rs=True, tp_x=1, tp_y=1, train_data_path=None, train_iters=1000, train_samples=None, transformer_impl='local', transformer_pipeline_model_parallel_size=1, ulysses_degree_in_cp=None, untie_embeddings_and_output_weights=True, use_checkpoint_args=False, use_checkpoint_opt_param_scheduler=False, use_cp_send_recv_overlap=True, use_cpu_initialization=None, use_deter_comp=False, use_dist_ckpt=False, use_distributed_optimizer=True, use_flash_attn=True, use_fused_moe_token_permute_and_unpermute=False, use_fused_ring_attention_update=False, use_fused_rmsnorm=True, use_fused_rotary_pos_emb=True, use_fused_swiglu=True, use_glm_rope=False, use_mc2=False, use_mcore_models=True, use_one_sent_docs=False, use_ring_exchange_p2p=False, use_rotary_position_embeddings=True, v_head_dim=None, valid_data_path=None, variable_seq_lengths=False, virtual_pipeline_model_parallel_size=None, vision_backbone_type='vit', vision_context_parallel=False, vision_downsample_ratio=0.5, vision_downsample_stride=1.0, vision_model_freeze=True, vision_model_lr_decay_rate=1.0, vision_model_lr_mult=1.0, vision_model_recompute=False, vision_model_type='intern_300m', vision_normalize_type='imagenet', vision_pretraining=False, vision_pretraining_type='classify', vision_process_type='dynamic', vision_projector_freeze=False, vision_projector_pre_norm=True, vision_projector_recompute=False, vision_projector_type='mlp', vision_seq_length=1025, vit_load='/', vocab_extra_ids=0, vocab_file=None, vocab_size=None, wandb_exp_name='', wandb_project='', wandb_save_dir='', weight_decay=0.0, weight_decay_incr_style='constant', world_size=512, yaml_cfg=None)vision_projector_recompute False + +model_provider config TransformerConfig(tensor_model_parallel_size=8, pipeline_model_parallel_size=1, virtual_pipeline_model_parallel_size=None, sequence_parallel=True, context_parallel_size=2, expert_model_parallel_size=1, perform_initialization=True, use_cpu_initialization=None, fp16=False, bf16=True, params_dtype=torch.bfloat16, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, enable_autocast=False, autocast_dtype=torch.bfloat16, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, pipeline_dtype=torch.bfloat16, variable_seq_lengths=False, 
overlap_p2p_comm=False, batch_p2p_comm=False, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=True, defer_embedding_wgrad_compute=False, pipeline_model_parallel_split_rank=None, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=True, cpu_offloading_weights=True, barrier_with_L1_time=True, num_layers=48, first_pipeline_num_layers=0, independent_parallel=False, hidden_size=5120, num_attention_heads=40, num_query_groups=8, ffn_hidden_size=13824, kv_channels=128, hidden_dropout=0.0, attention_dropout=0.0, fp32_residual_connection=False, apply_residual_connection_post_layernorm=False, layernorm_epsilon=1e-06, layernorm_zero_centered_gamma=False, add_bias_linear=False, add_qkv_bias=True, gated_linear_unit=True, activation_func=, activation_func_fp8_input_store=False, num_moe_experts=None, rotary_interleaved=False, window_size=None, normalization='RMSNorm', qk_layernorm=False, test_mode=False, init_method=.init_ at 0x7fb2591db310>, output_layer_init_method=.init_ at 0x7fb23001b310>, init_method_std=0.01, apply_query_key_layer_scaling=False, attention_softmax_in_fp32=True, bias_activation_fusion=True, masked_softmax_fusion=False, persist_layer_norm=True, memory_efficient_layer_norm=False, bias_dropout_fusion=True, apply_rope_fusion=True, recompute_granularity='full', recompute_method='block', recompute_num_layers=48, distribute_saved_activations=False, fp8=None, fp8_margin=0, fp8_interval=1, fp8_amax_history_len=1, fp8_amax_compute_algo='most_recent', fp8_wgrad=True, fp8_dot_product_attention=False, fp8_multi_head_attention=False, moe_router_load_balancing_type='aux_loss', moe_router_topk=2, moe_grouped_gemm=False, moe_aux_loss_coeff=0.0, moe_z_loss_coeff=0.0, moe_input_jitter_eps=None, moe_token_dropping=False, moe_token_dispatcher_type='allgather', moe_per_layer_logging=False, moe_expert_capacity_factor=None, moe_pad_expert_input_to_capacity=False, moe_token_drop_policy='probs', moe_layer_recompute=False, clone_scatter_output_in_embedding=True, disable_parameter_transpose_cache=False, enable_cuda_graph=False, max_position_embeddings=131072, rotary_percent=1.0) +vision_projector_recompute Falsemodel_provider transformer_layer_spec ModuleSpec(module=, params={}, submodules=TransformerLayerSubmodules(input_layernorm=, self_attention=ModuleSpec(module=, params={'attn_mask_type': }, submodules=SelfAttentionSubmodules(linear_qkv=, core_attention=, linear_proj=, q_layernorm=, k_layernorm=)), post_attn_norm=, self_attn_bda=, pre_cross_attn_layernorm=, cross_attention=, cross_attn_bda=, pre_mlp_layernorm=, mlp=ModuleSpec(module=, params={}, submodules=MLPSubmodules(linear_fc1=, linear_fc2=)), post_mlp_layernorm=, mlp_bda=, sharded_state_dict_keys_map={'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_'})) + +vision_projector_recompute False +Building intern_300m model ... +------------------------ vit_args ------------------------ + accumulate_allreduce_grads_in_fp32 .............. True + activation_func ................................. + adam_beta1 ...................................... 0.9 + adam_beta2 ...................................... 0.999 + adam_eps ........................................ 1e-08 + adaptive_cp_dynamic_attn_mask ................... False + adaptive_cp_manually_set_mask_list .............. False + adaptive_cp_only_reschedule ..................... False + adaptive_cp_without_coarse ...................... 
False + adaptive_recompute_device_size .................. -1 + adaptive_recompute_device_swap .................. False + adaptive_recompute_profiling_step ............... 10 + add_bias_linear ................................. True + add_class_token ................................. True + add_dense_bias .................................. False + add_position_embedding .......................... True + add_qkv_bias .................................... True + add_rmsnorm_offset .............................. False + adlr_autoresume ................................. False + adlr_autoresume_interval ........................ 1000 + apply_layernorm_1p .............................. False + apply_query_key_layer_scaling ................... False + apply_residual_connection_post_layernorm ........ False + apply_rope_fusion ............................... False + async_tensor_model_parallel_allreduce ........... False + attention_dropout ............................... 0.0 + attention_mask_on_cpu ........................... False + attention_softmax_in_fp32 ....................... True + attn_logit_softcapping .......................... None + auto_detect_ckpt_format ......................... False + barrier_with_L1_time ............................ True + bert_binary_head ................................ True + bert_embedder_type .............................. megatron + bert_load ....................................... None + bf16 ............................................ True + bias_activation_fusion .......................... False + bias_dropout_fusion ............................. False + bias_gelu_fusion ................................ False + bias_swiglu_fusion .............................. True + biencoder_projection_dim ........................ 0 + biencoder_shared_query_context_model ............ False + block_data_path ................................. None + check_for_nan_in_loss_and_grad .................. True + chunk_size ...................................... 4096 + ckpt_fully_parallel_save ........................ False + ckpt_step ....................................... None + classes_fraction ................................ 1.0 + clip_grad ....................................... 1.0 + clone_scatter_output_in_embedding ............... True + consumed_train_samples .......................... 0 + consumed_valid_samples .......................... 0 + context_parallel_algo ........................... megatron_cp_algo + context_parallel_size ........................... 1 + cp_attention_mask_type .......................... causal + cp_window_size .................................. 1 + create_attention_mask_in_dataloader ............. False + cross_dataset_joint ............................. True + data_cache_path ................................. None + data_parallel_random_init ....................... False + data_parallel_size .............................. 32 + data_path ....................................... ['/local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml'] + data_per_class_fraction ......................... 1.0 + data_seq_length ................................. 131072 + data_sharding ................................... True + dataloader_type ................................. single + decoder_num_layers .............................. None + decoder_seq_length .............................. 1025 + decoupled_lr .................................... None + decoupled_min_lr ................................ None + delay_grad_reduce ............................... 
True + delay_param_gather .............................. False + dim_model_base .................................. None + dino_bottleneck_size ............................ 256 + dino_freeze_last_layer .......................... 1 + dino_head_hidden_size ........................... 2048 + dino_local_crops_number ......................... 10 + dino_local_img_size ............................. 96 + dino_norm_last_layer ............................ False + dino_teacher_temp ............................... 0.07 + dino_warmup_teacher_temp ........................ 0.04 + dino_warmup_teacher_temp_epochs ................. 30 + dist_ckpt_format ................................ torch_dist + distribute_saved_activations .................... False + distributed_backend ............................. nccl + distributed_timeout_minutes ..................... 120 + dpo_beta ........................................ 0.1 + dpo_ftx ......................................... 0.0 + dpo_label_smoothing ............................. 0.0 + dpo_loss_type ................................... sigmoid + embed_layernorm ................................. False + embedding_multiplier_scale ...................... 1.0 + embedding_path .................................. None + empty_unused_memory_level ....................... 0 + enable_chunk_memory ............................. False + enable_chunk_sequence ........................... False + enable_hbmfault_repair .......................... False + enable_high_availability ........................ False + enable_one_logger ............................... False + enable_optimizer_state_local_copy ............... False + enable_recompute_layers_per_pp_rank ............. False + enable_token_rearrange_opt ...................... False + encoder_num_layers .............................. 48 + encoder_seq_length .............................. 1025 + end_weight_decay ................................ 0.0 + eod_mask_loss ................................... False + eval_interval ................................... 100 + eval_iters ...................................... 0 + evidence_data_path .............................. None + exit_duration_in_mins ........................... None + exit_interval ................................... None + exit_on_missing_checkpoint ...................... False + exit_signal_handler ............................. False + expert_interval ................................. 1 + expert_model_parallel_size ...................... 1 + ffn_hidden_size ................................. 4096 + fill_neg_inf .................................... False + finetune ........................................ True + first_k_dense_replace ........................... None + first_pipeline_num_layers ....................... 0 + fp16 ............................................ False + fp16_lm_cross_entropy ........................... False + fp32_residual_connection ........................ False + fp8 ............................................. None + fp8_amax_compute_algo ........................... most_recent + fp8_amax_history_len ............................ 1 + fp8_interval .................................... 1 + fp8_margin ...................................... 0 + fp8_wgrad ....................................... True + full_shuffle_instruction_dataset ................ False + gated_linear_unit ............................... False + geglu ........................................... False + gelu_tanh ....................................... 
False + global_batch_size ............................... 64 + gradient_accumulation_fusion .................... False + group_query_attention ........................... False + head_lr_mult .................................... 1.0 + hidden_dropout .................................. 0.0 + hidden_size ..................................... 1024 + high_freq_factor ................................ None + hysteresis ...................................... 2 + ict_head_size ................................... None + ict_load ........................................ None + image_size ...................................... 448 + image_token_length .............................. 256 + img_h ........................................... 448 + img_w ........................................... 448 + independent_parallel ............................ True + indexer_batch_size .............................. 128 + indexer_log_interval ............................ 1000 + inference_batch_times_seqlen_threshold .......... 512 + init_method_std ................................. 0.01 + init_method_xavier_uniform ...................... False + initial_loss_scale .............................. 4096.0 + input_embeds_norm ............................... False + input_jitter .................................... True + input_layernorm_in_fp32 ......................... False + interleave_sliding_window ....................... None + is_instruction_dataset .......................... True + is_pairwise_dataset ............................. False + iter_per_epoch .................................. 1250 + jit_compile ..................................... False + kv_channels ..................................... 64 + kv_head_repeat_before_uly_alltoall .............. True + kv_lora_rank .................................... None + language_model_freeze ........................... False + layernorm_zero_centered_gamma ................... False + lazy_mpu_init ................................... None + load ............................................ / + load_checkpoint_loosely ......................... False + local_rank ...................................... 0 + log_batch_size_to_tensorboard ................... False + log_interval .................................... 1 + log_learning_rate_to_tensorboard ................ True + log_loss_scale_to_tensorboard ................... True + log_memory_to_tensorboard ....................... False + log_num_zeros_in_grad ........................... False + log_params_norm ................................. False + log_progress .................................... False + log_throughput .................................. True + log_timers_to_tensorboard ....................... False + log_validation_ppl_to_tensorboard ............... False + log_world_size_to_tensorboard ................... False + logit_mask ...................................... False + lora_alpha ...................................... 32 + lora_fusion ..................................... False + lora_load ....................................... None + lora_modules_to_save ............................ None + lora_r .......................................... 16 + lora_register_forward_hook ...................... ['word_embeddings', 'input_layernorm'] + lora_target_modules ............................. [] + loss_scale ...................................... None + loss_scale_window ............................... 1000 + low_freq_factor ................................. 
None + lr .............................................. 5e-06 + lr_decay_iters .................................. None + lr_decay_samples ................................ None + lr_decay_style .................................. cosine + lr_warmup_fraction .............................. 0.03 + lr_warmup_init .................................. 0.0 + lr_warmup_iters ................................. 0 + lr_warmup_samples ............................... 0 + make_vocab_size_divisible_by .................... 1 + manual_gc ....................................... False + manual_gc_eval .................................. True + manual_gc_interval .............................. 0 + mask_factor ..................................... 1.0 + mask_prob ....................................... 0.15 + mask_type ....................................... random + masked_softmax_fusion ........................... False + max_fps ......................................... 1 + max_num_frame ................................... 512 + max_num_image ................................... 8 + max_patch_grid .................................. 12 + max_position_embeddings ......................... 1025 + max_tokens_to_oom ............................... 12000 + merge_file ...................................... None + micro_batch_size ................................ 1 + min_loss_scale .................................. 1.0 + min_lr .......................................... 1e-07 + min_patch_grid .................................. 1 + mmap_bin_files .................................. True + mock_data ....................................... False + model_type ...................................... ModelType.encoder_or_decoder + moe_allgather_overlap_comm ...................... False + moe_alltoall_overlap_comm ....................... False + moe_aux_loss_coeff .............................. 0.0 + moe_comm_aux_loss_coeff ......................... 0.0 + moe_device_level_aux_loss_coeff ................. 0.0 + moe_expert_capacity_factor ...................... None + moe_grouped_gemm ................................ False + moe_input_jitter_eps ............................ None + moe_intermediate_size ........................... None + moe_layer_freq .................................. None + moe_pad_expert_input_to_capacity ................ False + moe_per_layer_logging ........................... False + moe_permutation_async_comm ...................... False + moe_router_load_balancing_type .................. aux_loss + moe_router_topk ................................. 2 + moe_token_dispatcher_type ....................... allgather + moe_token_drop_policy ........................... probs + moe_token_dropping .............................. False + moe_tp_extend_ep ................................ False + moe_train_capacity_factor ....................... 1.0 + moe_without_activation .......................... False + moe_z_loss_coeff ................................ 0.0 + moe_zero_memory ................................. disable + multi_head_latent_attention ..................... False + n_shared_experts ................................ None + nccl_communicator_config_path ................... None + next_tockens .................................... 0 + no_load_optim ................................... True + no_load_rng ..................................... True + no_persist_layer_norm ........................... False + no_post_layer_norm .............................. False + no_save_optim ................................... 
None + no_save_rng ..................................... None + no_shared_storage ............................... False + no_shuffle ...................................... False + noisy_gate_policy ............................... None + noop_layers ..................................... None + norm_epsilon .................................... 1e-06 + norm_topk_prob .................................. False + normalization ................................... LayerNorm + num_attention_heads ............................. 16 + num_channels .................................... 3 + num_classes ..................................... 1000 + num_experts ..................................... None + num_layer_list .................................. + num_layers ...................................... 24 + num_layers_per_virtual_pipeline_stage ........... None + num_query_groups ................................ 16 + num_workers ..................................... 8 + one_logger_entity ............................... hwinf_dcm + one_logger_project .............................. e2e-tracking + one_logger_run_name ............................. None + onnx_safe ....................................... None + openai_gelu ..................................... False + optimizer ....................................... adam + original_max_position_embeddings ................ None + output_bert_embeddings .......................... False + output_layer_slice_num .......................... 1 + output_logit_softcapping ........................ None + output_multiplier_scale ......................... None + overlap_grad_reduce ............................. False + overlap_p2p_comm ................................ False + overlap_param_gather ............................ False + override_opt_param_scheduler .................... False + pad_to_multiple_of .............................. 8 + padded_vocab_size ............................... 152064 + params_dtype .................................... torch.bfloat16 + patch_dim ....................................... 14 + perform_initialization .......................... True + pipeline_model_parallel_size .................... 1 + pipeline_model_parallel_split_rank .............. None + position_embedding_type ......................... rope + post_norm ....................................... False + pre_tockens ..................................... 65536 + pref_ftx ........................................ 0.0 + pretrained_checkpoint ........................... None + profile ......................................... False + profile_level ................................... level0 + profile_ranks ................................... [-1] + profile_record_shapes ........................... False + profile_save_path ............................... ./profile_dir + profile_step_end ................................ 12 + profile_step_start .............................. 10 + profile_with_cpu ................................ False + profile_with_memory ............................. False + profile_with_stack .............................. False + prompt_format ................................... qwen2 + prompt_type ..................................... None + q_lora_rank ..................................... None + qk_layernorm .................................... False + qk_nope_head_dim ................................ None + qk_rope_head_dim ................................ None + query_in_block_prob ............................. 
0.1 + query_pre_attn_scalar ........................... None + rampup_batch_size ............................... None + rank ............................................ 0 + recompute_activation_function ................... False + recompute_activation_function_num_layers ........ None + recompute_granularity ........................... None + recompute_in_advance ............................ False + recompute_in_bubble ............................. False + recompute_method ................................ None + recompute_num_layers ............................ None + reduce_recompute_for_last_chunk ................. False + ref_model ....................................... None + reset_attention_mask ............................ False + reset_position_ids .............................. False + retriever_report_topk_accuracies ................ [] + retriever_score_scaling ......................... False + retriever_seq_length ............................ 256 + retro_add_retriever ............................. False + retro_attention_gate ............................ 1 + retro_cyclic_train_iters ........................ None + retro_encoder_attention_dropout ................. 0.1 + retro_encoder_hidden_dropout .................... 0.1 + retro_encoder_layers ............................ 2 + retro_num_neighbors ............................. 2 + retro_num_retrieved_chunks ...................... 2 + retro_project_dir ............................... None + retro_verify_neighbor_count ..................... True + reuse_fp32_param ................................ False + rope_scaling_beta_fast .......................... 32 + rope_scaling_beta_slow .......................... 1 + rope_scaling_factor ............................. 1.0 + rope_scaling_mscale ............................. 1.0 + rope_scaling_mscale_all_dim ..................... 0.0 + rope_scaling_original_max_position_embeddings ... None + rope_scaling_type ............................... None + rotary_base ..................................... 1000000.0 + rotary_interleaved .............................. False + rotary_percent .................................. 1.0 + rotary_seq_len_interpolation_factor ............. None + routed_scaling_factor ........................... None + sample_rate ..................................... 1.0 + save ............................................ /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213// + save_interval ................................... 100 + scale_depth ..................................... None + scale_emb ....................................... None + scatter_gather_tensors_in_pipeline .............. True + seed ............................................ 424242 + seq_aux ......................................... False + seq_length ...................................... 1025 + sequence_parallel ............................... False + sgd_momentum .................................... 0.9 + shape_order ..................................... SBH + shared_expert_gate .............................. False + shared_expert_gate_output_dimension ............. 1 + short_seq_prob .................................. 0.1 + skip_bias_add ................................... True + skip_train ...................................... False + sliding_window .................................. None + sparse_mode ..................................... 4 + spec ............................................ 
None + split ........................................... 100,0,0 + square_alibi_mask ............................... False + squared_relu .................................... False + stage ........................................... None + standalone_embedding_stage ...................... False + start_weight_decay .............................. 0.0 + swap_attention .................................. False + swap_modules .................................... None + swiglu .......................................... False + swin_backbone_type .............................. tiny + tensor_model_parallel_size ...................... 8 + tensorboard_dir ................................. None + tensorboard_log_interval ........................ 1 + tensorboard_queue_size .......................... 1000 + test_data_path .................................. None + test_mode ....................................... False + timing_log_level ................................ 0 + timing_log_option ............................... minmax + titles_data_path ................................ None + tokenizer_kwargs ................................ None + tokenizer_model ................................. None + tokenizer_name_or_path .......................... /data_4/models/Qwen/Qwen2.5-14B-Instruct/ + tokenizer_not_use_fast .......................... True + tokenizer_padding_side .......................... right + tokenizer_type .................................. PretrainedFromHF + topk_group ...................................... None + tp_2d ........................................... False + tp_comm_bulk_dgrad .............................. True + tp_comm_bulk_wgrad .............................. True + tp_comm_overlap ................................. False + tp_comm_overlap_ag .............................. True + tp_comm_overlap_cfg ............................. None + tp_comm_overlap_rs .............................. True + tp_comm_split_ag ................................ True + tp_comm_split_rs ................................ True + tp_x ............................................ 1 + tp_y ............................................ 1 + train_data_path ................................. None + train_iters ..................................... 1000 + train_samples ................................... None + transformer_impl ................................ local + transformer_pipeline_model_parallel_size ........ 1 + ulysses_degree_in_cp ............................ None + untie_embeddings_and_output_weights ............. True + use_checkpoint_args ............................. False + use_checkpoint_opt_param_scheduler .............. False + use_cp_send_recv_overlap ........................ True + use_cpu_initialization .......................... None + use_deter_comp .................................. False + use_dist_ckpt ................................... False + use_distributed_optimizer ....................... True + use_flash_attn .................................. True + use_fused_moe_token_permute_and_unpermute ....... False + use_fused_ring_attention_update ................. False + use_fused_rmsnorm ............................... True + use_fused_rotary_pos_emb ........................ True + use_fused_swiglu ................................ True + use_glm_rope .................................... False + use_mc2 ......................................... False + use_mcore_models ................................ True + use_one_sent_docs ............................... 
False + use_ring_exchange_p2p ........................... False + use_rotary_position_embeddings .................. True + v_head_dim ...................................... None + valid_data_path ................................. None + variable_seq_lengths ............................ False + virtual_pipeline_model_parallel_size ............ None + vision_backbone_type ............................ vit + vision_context_parallel ......................... False + vision_downsample_ratio ......................... 0.5 + vision_downsample_stride ........................ 1.0 + vision_model_freeze ............................. True + vision_model_lr_decay_rate ...................... 1.0 + vision_model_lr_mult ............................ 1.0 + vision_model_recompute .......................... False + vision_model_type ............................... intern_300m + vision_normalize_type ........................... imagenet + vision_pretraining .............................. False + vision_pretraining_type ......................... classify + vision_process_type ............................. dynamic + vision_projector_freeze ......................... False + vision_projector_pre_norm ....................... True + vision_projector_recompute ...................... False + vision_projector_type ........................... mlp + vision_seq_length ............................... 1025 + vit_load ........................................ / + vocab_extra_ids ................................. 0 + vocab_file ...................................... None + vocab_size ...................................... None + wandb_exp_name .................................. + wandb_project ................................... + wandb_save_dir .................................. + weight_decay .................................... 0.0 + weight_decay_incr_style ......................... constant + world_size ...................................... 512 + yaml_cfg ........................................ 
None +-------------------- end of vit_args --------------------- +MegatronVisionModel vision_model_config VisionTransformerConfig(tensor_model_parallel_size=8, pipeline_model_parallel_size=1, virtual_pipeline_model_parallel_size=None, sequence_parallel=False, context_parallel_size=1, expert_model_parallel_size=1, perform_initialization=True, use_cpu_initialization=None, fp16=False, bf16=True, params_dtype=torch.bfloat16, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, enable_autocast=False, autocast_dtype=torch.bfloat16, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, pipeline_dtype=torch.bfloat16, variable_seq_lengths=False, overlap_p2p_comm=False, batch_p2p_comm=False, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=True, defer_embedding_wgrad_compute=False, pipeline_model_parallel_split_rank=None, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=True, cpu_offloading_weights=True, barrier_with_L1_time=True, num_layers=24, first_pipeline_num_layers=0, independent_parallel=True, hidden_size=1024, num_attention_heads=16, num_query_groups=16, ffn_hidden_size=4096, kv_channels=64, hidden_dropout=0.0, attention_dropout=0.0, fp32_residual_connection=False, apply_residual_connection_post_layernorm=False, layernorm_epsilon=1e-06, layernorm_zero_centered_gamma=False, add_bias_linear=True, add_qkv_bias=True, gated_linear_unit=False, activation_func=, activation_func_fp8_input_store=False, num_moe_experts=None, rotary_interleaved=False, window_size=None, normalization='LayerNorm', qk_layernorm=False, test_mode=False, init_method=.init_ at 0x7fb23001b3a0>, output_layer_init_method=.init_ at 0x7fb23001be50>, init_method_std=0.01, apply_query_key_layer_scaling=False, attention_softmax_in_fp32=True, bias_activation_fusion=False, masked_softmax_fusion=False, persist_layer_norm=True, memory_efficient_layer_norm=False, bias_dropout_fusion=False, apply_rope_fusion=False, recompute_granularity=None, recompute_method=None, recompute_num_layers=None, distribute_saved_activations=False, fp8=None, fp8_margin=0, fp8_interval=1, fp8_amax_history_len=1, fp8_amax_compute_algo='most_recent', fp8_wgrad=True, fp8_dot_product_attention=False, fp8_multi_head_attention=False, moe_router_load_balancing_type='aux_loss', moe_router_topk=2, moe_grouped_gemm=False, moe_aux_loss_coeff=0.0, moe_z_loss_coeff=0.0, moe_input_jitter_eps=None, moe_token_dropping=False, moe_token_dispatcher_type='allgather', moe_per_layer_logging=False, moe_expert_capacity_factor=None, moe_pad_expert_input_to_capacity=False, moe_token_drop_policy='probs', moe_layer_recompute=False, clone_scatter_output_in_embedding=True, disable_parameter_transpose_cache=False, enable_cuda_graph=False, max_position_embeddings=1025, rotary_percent=1.0, img_w=448, img_h=448, patch_dim=14) +MegatronVisionModel vision_model_layer_spec ModuleSpec(module=, params={}, submodules=TransformerLayerSubmodules(input_layernorm=, self_attention=ModuleSpec(module=, params={'attn_mask_type': }, submodules=SelfAttentionSubmodules(linear_qkv=, core_attention=, linear_proj=, q_layernorm=None, k_layernorm=None)), 
post_attn_norm=, self_attn_bda=, pre_cross_attn_layernorm=, cross_attention=, cross_attn_bda=, pre_mlp_layernorm=, mlp=ModuleSpec(module=, params={}, submodules=MLPSubmodules(linear_fc1=, linear_fc2=)), post_mlp_layernorm=, mlp_bda=, sharded_state_dict_keys_map={})) +WARNING: could not find the metadata file /latest_checkpointed_iteration.txt + will not load any checkpoints and will start from random +vision_projector_config TransformerConfig(tensor_model_parallel_size=8, pipeline_model_parallel_size=1, virtual_pipeline_model_parallel_size=None, sequence_parallel=True, context_parallel_size=2, expert_model_parallel_size=1, perform_initialization=True, use_cpu_initialization=None, fp16=False, bf16=True, params_dtype=torch.bfloat16, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, enable_autocast=False, autocast_dtype=torch.bfloat16, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, pipeline_dtype=torch.bfloat16, variable_seq_lengths=False, overlap_p2p_comm=False, batch_p2p_comm=False, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=True, defer_embedding_wgrad_compute=False, pipeline_model_parallel_split_rank=None, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=True, cpu_offloading_weights=True, barrier_with_L1_time=True, num_layers=48, first_pipeline_num_layers=0, independent_parallel=False, hidden_size=5120, num_attention_heads=40, num_query_groups=8, ffn_hidden_size=1024, kv_channels=128, hidden_dropout=0.0, attention_dropout=0.0, fp32_residual_connection=False, apply_residual_connection_post_layernorm=False, layernorm_epsilon=1e-06, layernorm_zero_centered_gamma=False, add_bias_linear=False, add_qkv_bias=True, gated_linear_unit=False, activation_func=, activation_func_fp8_input_store=False, num_moe_experts=None, rotary_interleaved=False, window_size=None, normalization='RMSNorm', qk_layernorm=False, test_mode=False, init_method=.init_ at 0x7fb23001bee0>, output_layer_init_method=.init_ at 0x7fb23001bf70>, init_method_std=0.01, apply_query_key_layer_scaling=False, attention_softmax_in_fp32=True, bias_activation_fusion=False, masked_softmax_fusion=False, persist_layer_norm=True, memory_efficient_layer_norm=False, bias_dropout_fusion=True, apply_rope_fusion=True, recompute_granularity='full', recompute_method='block', recompute_num_layers=48, distribute_saved_activations=False, fp8=None, fp8_margin=0, fp8_interval=1, fp8_amax_history_len=1, fp8_amax_compute_algo='most_recent', fp8_wgrad=True, fp8_dot_product_attention=False, fp8_multi_head_attention=False, moe_router_load_balancing_type='aux_loss', moe_router_topk=2, moe_grouped_gemm=False, moe_aux_loss_coeff=0.0, moe_z_loss_coeff=0.0, moe_input_jitter_eps=None, moe_token_dropping=False, moe_token_dispatcher_type='allgather', moe_per_layer_logging=False, moe_expert_capacity_factor=None, moe_pad_expert_input_to_capacity=False, moe_token_drop_policy='probs', moe_layer_recompute=False, clone_scatter_output_in_embedding=True, disable_parameter_transpose_cache=False, enable_cuda_graph=False, max_position_embeddings=131072, rotary_percent=1.0) +vision_projector_layer_spec 
MLPSubmodules(linear_fc1=, linear_fc2=) +vision_model_freeze +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. 
+ + +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. +vision_model_freeze + +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +vision_model_freeze + + +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
+ + +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
+ + +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + + + +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. 
+vision_model_freeze
+=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
+=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
+=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+[... the same set of "requires grad to False" messages repeats for external_feature_model.vit.decoder.layers.1 through layers.23; the per-rank output is interleaved in the original log ...]
+ +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. 
+ + + + +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +vision_model_freeze +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
+ + +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.vision_model_freeze=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. + + + + +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. 
+ + + +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + + + + +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
[the same fourteen parameters are set to requires grad False for every InternViT decoder layer 0-23; each parallel worker process prints its own copy of these messages]
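The block above shows the InternViT vision encoder being frozen before training starts. As a minimal sketch only, assuming plain PyTorch and an illustrative helper name (freeze_module is not taken from the training code), messages in exactly this style can be produced like so:

import torch.nn as nn

def freeze_module(module: nn.Module, prefix: str = "") -> None:
    # Disable gradients for every parameter under `module` and report it in the
    # same format as the log lines above.
    for name, param in module.named_parameters(prefix=prefix):
        param.requires_grad = False
        print(f"=> set param {name} {param.shape} requires grad to False.")

# Hypothetical usage against the model printed below:
# freeze_module(model.external_feature_model.vit, prefix="external_feature_model.vit")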
+model GPTVLModel(
+  (external_feature_model): MegatronVisionModel(
+    (vit): InternViTModel(
+      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
+      (position_embeddings): Embedding(1025, 1024)
+      (decoder): TransformerBlock(
+        (layers): ModuleList(
+          (0-23): 24 x InternViTTransformerLayer(
+            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
+            (self_attention): SelfAttention(
+              (core_attention): DotProductAttention(
+                (scale_mask_softmax): FusedScaleMaskSoftmax()
+                (attention_dropout): Dropout(p=0.0, inplace=False)
+              )
+              (linear_proj): RowParallelLinear()
+              (linear_qkv): ColumnParallelLinear()
+            )
+            (self_attn_bda): IdentityFuncOp()
+            (pre_cross_attn_layernorm): IdentityOp()
+            (cross_attention): IdentityOp()
+            (cross_attn_bda): IdentityFuncOp()
+            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
+            (mlp): MLP(
+              (linear_fc1): ColumnParallelLinear()
+              (linear_fc2): RowParallelLinear()
+            )
+            (mlp_bda): IdentityFuncOp()
+          )
+        )
+      )
+    )
+    (vision_projection): MultimodalProjector(
+      (encoder): MLP(
+        (linear_fc1): ColumnParallelLinear()
+        (linear_fc2): RowParallelLinear()
+      )
+    )
+    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
+  )
+  (embedding): LanguageModelEmbedding(
+    (word_embeddings): VocabParallelEmbedding()
+    (embedding_dropout): Dropout(p=0.0, inplace=False)
+  )
+  (rotary_pos_emb): RotaryEmbedding()
+  (decoder): TransformerBlock(
+    (layers): ModuleList(
+      (0-47): 48 x TransformerLayer(
+        (input_layernorm): RMSNorm()
+        (self_attention): SelfAttention(
+          (core_attention): DotProductAttention(
+            (scale_mask_softmax): FusedScaleMaskSoftmax()
+            (attention_dropout): Dropout(p=0.0, inplace=False)
+          )
+          (linear_proj): RowParallelLinear()
+          (linear_qkv): ColumnParallelLinear()
+          (q_layernorm): IdentityOp()
+          (k_layernorm): IdentityOp()
+        )
+        (pre_cross_attn_layernorm): IdentityOp()
+        (cross_attention): IdentityOp()
+        (cross_attn_bda): IdentityFuncOp()
+        (pre_mlp_layernorm): RMSNorm()
+        (mlp): MLP(
+          (linear_fc1): ColumnParallelLinear()
+          (linear_fc2): RowParallelLinear()
+        )
+      )
+    )
+    (final_layernorm): RMSNorm()
+  )
+  (output_layer): ColumnParallelLinear()
+)
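Given the module tree printed above, a quick way to confirm which top-level submodules ended up frozen is to group parameter counts by the leading name component. This is a sketch in plain PyTorch, not part of the logged run:

from collections import defaultdict
import torch.nn as nn

def summarize_trainable(model: nn.Module) -> None:
    # name -> [trainable element count, frozen element count]
    stats = defaultdict(lambda: [0, 0])
    for name, param in model.named_parameters():
        top = name.split(".")[0]  # e.g. "external_feature_model", "decoder", "embedding"
        stats[top][0 if param.requires_grad else 1] += param.numel()
    for top, (trainable, frozen) in sorted(stats.items()):
        print(f"{top}: trainable={trainable:,} frozen={frozen:,}")

With the freezing shown above, the vit parameters inside external_feature_model would all land in the frozen column.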
+=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
+ + +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
+ + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.vision_model_freeze=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. 
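The block of "requires grad to False" messages above records the stage-3 vision-encoder freeze: every parameter of the InternViT tower is excluded from the optimizer and its tensor-parallel shard shape is logged. A minimal sketch of such a freeze pass is shown below; the `model.external_feature_model.vit` handle and the `vision_model_freeze` print mirror the log lines, but this is an illustration, not the actual ModelLink code path.

```python
import torch.nn as nn

def freeze_vision_tower(model: nn.Module) -> None:
    """Minimal sketch of the freeze pass reflected in the log above.

    Assumes the composite model exposes the vision encoder as
    `model.external_feature_model.vit` (as in the printed module tree);
    the real training framework's implementation may differ.
    """
    print("vision_model_freeze")
    vit = model.external_feature_model.vit
    for name, param in vit.named_parameters():
        param.requires_grad = False
        # Same format as the log lines: "=> set param <name> <shape> requires grad to False."
        print(f"=> set param external_feature_model.vit.{name} "
              f"{param.shape} requires grad to False.")
```

Because the run uses tensor parallelism (tp8 in the script name), each rank holds only a 1/8 shard of the column/row-parallel weights, which is why the logged shapes (for example linear_qkv.weight as torch.Size([384, 1024]) rather than a full [3072, 1024]) are smaller than the checkpoint shapes and why the same message recurs once per rank in the interleaved log.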
+=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +) + + +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
+=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +vision_model_freeze=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
+model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +)=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. 
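+The "=> set param ... requires grad to False" lines above and below record the vision-tower freeze before stage-3 fine-tuning: every parameter under external_feature_model.vit has its gradient disabled, while no such messages appear for the vision projection or the language decoder, which suggests those parts stay trainable. A minimal sketch of how such a freeze step is typically implemented is shown below; the function name and exact print format are illustrative assumptions, not the repository's actual helper.
+
+from torch import nn
+
+def freeze_module(module: nn.Module, prefix: str = "") -> None:
+    # Disable gradients for every parameter of `module` and log each change
+    # in the same style as the '=> set param ...' lines in this log.
+    for name, param in module.named_parameters():
+        param.requires_grad = False
+        print(f"=> set param {prefix}{name} {param.shape} requires grad to False.")
+
+# e.g. freeze only the InternViT tower of the combined model (illustrative call):
+# freeze_module(model.external_feature_model.vit, prefix="external_feature_model.vit.")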
+ +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
+ +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
+> number of parameters on (tensor, pipeline) model parallel rank (6, 0): 1887497216
+=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
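+The "> number of parameters on (tensor, pipeline) model parallel rank (6, 0): 1887497216" line reports the size of one rank's model shard, not the full model. The rank index (6, 0) implies at least seven tensor-parallel ranks; if the run uses 8-way tensor parallelism, eight shards of roughly 1.89B parameters add up to about 15.1B, consistent with the 48-layer language decoder plus the 24-layer ViT and projection shown in the printout above. A per-rank count like this is usually just a sum over the shard's parameters; the helper below is an illustrative sketch, not Megatron's exact implementation.
+
+from torch import nn
+
+def report_param_count(model: nn.Module, tp_rank: int, pp_rank: int) -> int:
+    # Sum the elements of every parameter held by this rank's shard and
+    # print it in the same format as the log line above.
+    n_params = sum(p.numel() for p in model.parameters())
+    print(f"> number of parameters on (tensor, pipeline) model parallel rank "
+          f"({tp_rank}, {pp_rank}): {n_params}")
+    return n_params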
+=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
+vision_model_freeze
+=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. +vision_model_freeze=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
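
The vision_model_freeze step above walks every parameter of the InternViT tower and turns off its gradient. A minimal sketch of what that loop likely does, assuming plain PyTorch (the helper name freeze_vision_model is made up; only the printed message format is taken from the log):

    import torch

    def freeze_vision_model(vit: torch.nn.Module) -> None:
        # Exclude every ViT parameter from autograd and log it in the same
        # format as the messages above.
        for name, param in vit.named_parameters():
            param.requires_grad = False
            print(f"=> set param external_feature_model.vit.{name} {param.shape} requires grad to False.")
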
+model GPTVLModel(
+  (external_feature_model): MegatronVisionModel(
+    (vit): InternViTModel(
+      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
+      (position_embeddings): Embedding(1025, 1024)
+      (decoder): TransformerBlock(
+        (layers): ModuleList(
+          (0-23): 24 x InternViTTransformerLayer(
+            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
+            (self_attention): SelfAttention(
+              (core_attention): DotProductAttention(
+                (scale_mask_softmax): FusedScaleMaskSoftmax()
+                (attention_dropout): Dropout(p=0.0, inplace=False)
+              )
+              (linear_proj): RowParallelLinear()
+              (linear_qkv): ColumnParallelLinear()
+            )
+            (self_attn_bda): IdentityFuncOp()
+            (pre_cross_attn_layernorm): IdentityOp()
+            (cross_attention): IdentityOp()
+            (cross_attn_bda): IdentityFuncOp()
+            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
+            (mlp): MLP(
+              (linear_fc1): ColumnParallelLinear()
+              (linear_fc2): RowParallelLinear()
+            )
+            (mlp_bda): IdentityFuncOp()
+          )
+        )
+      )
+    )
+    (vision_projection): MultimodalProjector(
+      (encoder): MLP(
+        (linear_fc1): ColumnParallelLinear()
+        (linear_fc2): RowParallelLinear()
+      )
+    )
+    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
+  )
+  (embedding): LanguageModelEmbedding(
+    (word_embeddings): VocabParallelEmbedding()
+    (embedding_dropout): Dropout(p=0.0, inplace=False)
+  )
+  (rotary_pos_emb): RotaryEmbedding()
+  (decoder): TransformerBlock(
+    (layers): ModuleList(
+      (0-47): 48 x TransformerLayer(
+        (input_layernorm): RMSNorm()
+        (self_attention): SelfAttention(
+          (core_attention): DotProductAttention(
+            (scale_mask_softmax): FusedScaleMaskSoftmax()
+            (attention_dropout): Dropout(p=0.0, inplace=False)
+          )
+          (linear_proj): RowParallelLinear()
+          (linear_qkv): ColumnParallelLinear()
+          (q_layernorm): IdentityOp()
+          (k_layernorm): IdentityOp()
+        )
+        (pre_cross_attn_layernorm): IdentityOp()
+        (cross_attention): IdentityOp()
+        (cross_attn_bda): IdentityFuncOp()
+        (pre_mlp_layernorm): RMSNorm()
+        (mlp): MLP(
+          (linear_fc1): ColumnParallelLinear()
+          (linear_fc2): RowParallelLinear()
+        )
+      )
+    )
+    (final_layernorm): RMSNorm()
+  )
+  (output_layer): ColumnParallelLinear()
+)
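
The GPTVLModel dump above shows the overall composition: a frozen InternViT tower, a LayerNorm over 4096-dimensional merged patch features, an MLP projector, and the 48-layer Qwen2.5 decoder. A toy sketch of that data flow, assuming InternVL-style merging of four 1024-d patch tokens into 4096-d features and a 5120-d LLM hidden size (both assumptions; the module names follow the dump, the stub internals do not):

    import torch

    class TinyGPTVL(torch.nn.Module):
        def __init__(self, merged_dim=4096, llm_dim=5120):
            super().__init__()
            self.vit = torch.nn.Identity()                 # stand-in for the frozen InternViTModel
            self.pre_proj_layernorm = torch.nn.LayerNorm(merged_dim)
            self.vision_projection = torch.nn.Sequential(  # stand-in for MultimodalProjector.encoder
                torch.nn.Linear(merged_dim, llm_dim),
                torch.nn.GELU(),
                torch.nn.Linear(llm_dim, llm_dim),
            )
            self.decoder = torch.nn.Identity()             # stand-in for the 48-layer Qwen block

        def forward(self, patch_feats, text_embeds):
            # patch_feats: [n, 4096] merged ViT tokens; text_embeds: [m, 5120]
            v = self.vision_projection(self.pre_proj_layernorm(self.vit(patch_feats)))
            return self.decoder(torch.cat([v, text_embeds], dim=0))
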
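
The shard shapes in the freeze messages are consistent with the tp8 setting in the launch script: ColumnParallelLinear splits its output dimension across the eight tensor-parallel ranks, while RowParallelLinear splits its input dimension. A quick check of that arithmetic (the 4096 FFN width is inferred from 512 x 8 and is an assumption):

    # Per-rank shard shapes for the 1024-d InternViT with 8-way tensor parallelism.
    hidden, ffn, tp = 1024, 4096, 8
    qkv_rows = 3 * hidden // tp   # 384 -> linear_qkv.weight torch.Size([384, 1024])
    proj_cols = hidden // tp      # 128 -> linear_proj.weight torch.Size([1024, 128])
    fc1_rows = ffn // tp          # 512 -> mlp.linear_fc1.weight torch.Size([512, 1024])
    assert (qkv_rows, proj_cols, fc1_rows) == (384, 128, 512)
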
+=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1887497216=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
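The "> number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1887497216" line reports the parameter count held locally by that rank. In Megatron-style code this is typically just the sum of numel() over the rank's model shards; a rough sketch (assumed, not the exact ModelLink code):
```python
def report_rank_params(model_chunks, tp_rank: int, pp_rank: int) -> None:
    # Each rank only holds its tensor-/pipeline-parallel shard of the weights,
    # so this counts local parameters, not the full 14B+0.3B model.
    total = sum(p.numel() for chunk in model_chunks for p in chunk.parameters())
    print(f" > number of parameters on (tensor, pipeline) model parallel rank "
          f"({tp_rank}, {pp_rank}): {total}", flush=True)
```
With TP=8 and PP=1 in this run, every tensor-parallel rank reports the same ~1.89B local parameters: roughly an eighth of the sharded weights plus the unsharded ones.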
+=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +vision_model_freeze=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
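The repeated "=> set param ... requires grad to False." lines, together with the "vision_model_freeze" marker above, correspond to freezing the vision tower before stage-3 finetuning. A minimal sketch of such a freeze-and-log pass, assuming the parameter-name prefix seen in the log (the actual helper in cognitron_vl/ModelLink may differ):
```python
import torch.nn as nn

def freeze_vision_model(model: nn.Module, prefix: str = "external_feature_model.vit") -> None:
    """Freeze every vision-tower parameter and log it, mirroring the
    '=> set param ... requires grad to False.' lines in this log."""
    for name, param in model.named_parameters():
        if name.startswith(prefix):
            param.requires_grad = False
            print(f"=> set param {name} {param.shape} requires grad to False.")
```
Because all ranks write these messages to the same stdout, output from different tensor-parallel ranks interleaves, which is why consecutive lines above jump between layer indices (e.g. layers 10, 11, 21, 22).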
+=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
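The frozen external_feature_model.vit.position_embeddings.weight has shape [1025, 1024]: one class-token position plus a 32x32 grid of patch positions, consistent with the 14x14 conv1 patch size and a 448x448 input. A quick check (the 448px input resolution is an assumption; this log does not state it explicitly):
```python
image_size, patch_size = 448, 14             # assumed input resolution; patch size from conv1
patches_per_side = image_size // patch_size  # 32
num_positions = patches_per_side ** 2 + 1    # 1024 patch tokens + 1 class token
assert num_positions == 1025                 # matches Embedding(1025, 1024) in the printout
```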
+=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 1887497216=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +)=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + > number of parameters on (tensor, pipeline) model parallel rank (4, 0): 1887497216 +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +)=> set param 
external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 1887497216 +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. 
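The vision_model_freeze messages above show every InternViT parameter being excluded from training before stage-3 finetuning starts. A minimal sketch of how such a freeze is typically implemented in PyTorch (the helper name, module prefix and print format below are assumptions for illustration, not the project's verified code):

    import torch.nn as nn

    def freeze_vision_tower(model: nn.Module, prefix: str = "external_feature_model.vit") -> None:
        # Walk every parameter; anything under the vision tower stops receiving gradients.
        for name, param in model.named_parameters():
            if name.startswith(prefix):
                param.requires_grad = False
                # Mirrors the "=> set param ... requires grad to False." lines in this log.
                print(f"=> set param {name} {param.shape} requires grad to False.")

Frozen parameters still take part in the forward pass and are simply skipped by the optimizer, which is why they continue to appear in the per-rank parameter counts reported in this log.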
+ > number of parameters on (tensor, pipeline) model parallel rank (7, 0): 1887497216 + > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 1887497216 +model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +) + > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 1887497216 + > number of parameters on (tensor, pipeline) model parallel rank (5, 0): 1887497216 +vision_model_freeze +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + > number of parameters on (tensor, pipeline) model parallel rank (5, 0): 1887497216 +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +)=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
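Note: the per-rank shapes in the frozen-parameter lines and in the GPTVLModel printout above follow from tensor-parallel sharding of the InternViT encoder (the intern_300m model in the script name) across TP=8. A minimal sketch of the arithmetic; the hidden and FFN widths below are inferred from the printed torch.Size entries, not read from the training config:

# Sketch: how the per-rank ViT parameter shapes in this log arise under tensor parallelism (TP=8).
# Assumed values are inferred from the printed shapes above, not taken from the config file.
tp = 8                       # tensor-parallel world size ("tp8" in the script name)
hidden = 1024                # InternViT hidden size (input_layernorm has 1024 elements)
ffn_hidden = 4096            # full MLP width; 4096 / 8 = 512 matches linear_fc1 [512, 1024]
qkv_out = 3 * hidden         # fused Q, K, V projection

# ColumnParallelLinear shards the output dimension across ranks:
assert qkv_out // tp == 384      # linear_qkv.weight  -> torch.Size([384, 1024])
assert ffn_hidden // tp == 512   # mlp.linear_fc1.weight -> torch.Size([512, 1024])

# RowParallelLinear shards the input dimension across ranks:
assert hidden // tp == 128       # linear_proj.weight -> torch.Size([1024, 128])
assert ffn_hidden // tp == 512   # mlp.linear_fc2.weight -> torch.Size([1024, 512])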
+=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +) + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. + > number of parameters on (tensor, pipeline) model parallel rank (6, 0): 1887497216 +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + > number of parameters on (tensor, pipeline) model parallel rank (7, 0): 1887497216 +model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +) + > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 1887497216 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1887497216 + > number of parameters on (tensor, pipeline) model parallel rank (4, 0): 1887497216 +vision_model_freeze +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. 
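Note: the "vision_model_freeze" marker and the "=> set param ... requires grad to False." lines around it come from freezing the ViT before stage-3 finetuning. A minimal sketch of what such a freeze routine presumably does; the function name and print format are taken from this log, while the actual implementation in cognitron_vl may differ:

import torch.nn as nn

def vision_model_freeze(model: nn.Module, prefix: str = "external_feature_model.vit") -> None:
    """Freeze every parameter under the ViT submodule and log it, mirroring the lines above."""
    for name, param in model.named_parameters():
        if name.startswith(prefix):
            param.requires_grad = False
            # param.shape prints as torch.Size([...]), matching the log format
            print(f"=> set param {name} {param.shape} requires grad to False.")

Only the vision tower is frozen here; the vision_projection, pre_proj_layernorm, and the language decoder are left trainable, which is why no "set param" lines appear for them.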
+=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +) + > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 1887497216 +INFO:megatron.core.distributed.param_and_grad_buffer:Number of buckets for gradient all-reduce / reduce-scatter: 39 +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 1 (97320960 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.output_layer.weight +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 2 (43270016 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.final_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.mlp.linear_fc2.weight 
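The "=> set param ... requires grad to False." messages above show the InternViT parameters being frozen before training, and the GPTVLModel printout summarizes the resulting stack: a frozen InternViT encoder, an MLP vision projector with a pre-projection LayerNorm, and a 48-layer Qwen2.5 decoder. The reported 1,887,497,216 parameters are for tensor-parallel rank 2 only; summed over the eight tensor-parallel ranks implied by the tp8 setting this comes to roughly 15.1 B parameters, in line with a 14 B decoder plus a ~300 M ViT, ignoring modules that are replicated on every rank. A minimal sketch of how such freezing is typically done (the freeze_vision_encoder helper and the model handle are illustrative, not taken from the actual training code):

    import torch

    def freeze_vision_encoder(model: torch.nn.Module,
                              prefix: str = "external_feature_model.vit") -> None:
        # Walk the parameter tree and disable gradients for every parameter under
        # the vision encoder, printing a line in the same format as the log above.
        for name, param in model.named_parameters():
            if name.startswith(prefix):
                param.requires_grad = False
                print(f"=> set param {name} {param.shape} requires grad to False.")

Frozen parameters produce no gradients, which is why the gradient all-reduce buckets listed below contain only the language-model decoder, the output layer, and the trainable projector, not the ViT weights.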
+INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 3 (52112256 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 4 (42287872 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 5 (43270016 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.43.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.43.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.43.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.43.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.43.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: 
module.decoder.layers.43.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.43.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 6 (52112256 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 7 (42287872 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 8 (43270016 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 9 (52112256 elements): 
+INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 10 (42287872 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 11 (43270016 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 12 (52112256 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.self_attention.linear_proj.weight 
+INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 13 (42287872 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 14 (43270016 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 15 (52112256 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.input_layernorm.weight 
+INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 16 (42287872 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 17 (43270016 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 18 (52112256 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.self_attention.linear_qkv.bias 
+INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.25.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.25.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 19 (42287872 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.25.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.25.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.24.mlp.linear_fc2.weight +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.24.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.24.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.24.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.25.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.25.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.25.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.24.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.24.self_attention.linear_qkv.bias +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 20 (43270016 elements): + +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.24.input_layernorm.weight +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.23.pre_mlp_layernorm.weight +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.23.self_attention.linear_qkv.bias + +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.22.mlp.linear_fc2.weight +_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.23.mlp.linear_fc2.weight +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.23.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.23.self_attention.linear_qkv.weight +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.23.self_attention.linear_proj.weight +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: 
module.decoder.layers.23.input_layernorm.weight + +_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 21 (52112256 elements): +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.22.mlp.linear_fc1.weight +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.22.self_attention.linear_qkv.weight +_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.22.self_attention.linear_proj.weight +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.22.input_layernorm.weight +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.22.self_attention.linear_qkv.bias +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.22.pre_mlp_layernorm.weight +_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.21.mlp.linear_fc2.weight +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.21.mlp.linear_fc1.weight_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 22 (42287872 elements): +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.21.pre_mlp_layernorm.weight +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.21.self_attention.linear_qkv.bias +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.20.mlp.linear_fc2.weight +_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.20.mlp.linear_fc1.weight +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.20.self_attention.linear_qkv.weight +_get_param_groups 
name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.20.self_attention.linear_proj.weight_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) + +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.21.input_layernorm.weight_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.21.self_attention.linear_proj.weight +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.21.self_attention.linear_qkv.weight + + +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.20.pre_mlp_layernorm.weight +_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.20.self_attention.linear_qkv.bias +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 23 (43270016 elements): +_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.20.input_layernorm.weight +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.19.pre_mlp_layernorm.weight +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.19.self_attention.linear_qkv.bias_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.18.mlp.linear_fc2.weight +_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) 
+INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.19.mlp.linear_fc1.weight +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.19.mlp.linear_fc2.weight_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.19.self_attention.linear_qkv.weight_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.19.self_attention.linear_proj.weight + + +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.19.input_layernorm.weight +_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 24 (52112256 elements):_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.18.mlp.linear_fc1.weight +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) + +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.18.self_attention.linear_qkv.weight +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.18.self_attention.linear_proj.weight_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.18.input_layernorm.weight + + +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name 
module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.18.self_attention.linear_qkv.bias +_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.18.pre_mlp_layernorm.weight +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.17.mlp.linear_fc2.weight +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.17.mlp.linear_fc1.weight +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 25 (42287872 elements): + + +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.17.self_attention.linear_qkv.bias +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.16.mlp.linear_fc2.weight +_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) + +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.16.mlp.linear_fc1.weight +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.16.self_attention.linear_qkv.weight +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.16.self_attention.linear_proj.weight +_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias 
key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.17.pre_mlp_layernorm.weight +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.17.self_attention.linear_qkv.weight +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.17.input_layernorm.weight +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.17.self_attention.linear_proj.weight_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.16.pre_mlp_layernorm.weight +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.16.self_attention.linear_qkv.bias +_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 
1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 26 (43270016 elements): +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.16.input_layernorm.weight + +_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.15.pre_mlp_layernorm.weight +_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.15.self_attention.linear_qkv.bias +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.14.mlp.linear_fc2.weight_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.15.mlp.linear_fc1.weight +_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.15.mlp.linear_fc2.weight +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.15.self_attention.linear_qkv.weight + +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) 
+INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.15.self_attention.linear_proj.weight +_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.15.input_layernorm.weight +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 27 (52112256 elements): +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.14.mlp.linear_fc1.weight +_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.14.self_attention.linear_proj.weight +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.14.self_attention.linear_qkv.weight +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.14.input_layernorm.weight + +_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.14.self_attention.linear_qkv.bias +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.14.pre_mlp_layernorm.weight +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.13.mlp.linear_fc2.weight +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) 
+_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.{0..29}.input_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.{0..29}.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.{0..29}.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.{0..29}.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.{0..29}.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.{0..29}.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.{0..29}.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
+INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 28 (42287872 elements): module.decoder.layers.12-13.*
+INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 29 (43270016 elements): module.decoder.layers.10-12.*
+INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 30 (52112256 elements): module.decoder.layers.9-10.*
+INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 31 (42287872 elements): module.decoder.layers.8-9.*
+INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 32 (43270016 elements): module.decoder.layers.6-8.*
+INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 33 (52112256 elements): module.decoder.layers.5-6.*
+INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 34 (42287872 elements): module.decoder.layers.4-5.*
+INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 35 (43270016 elements): module.decoder.layers.2-4.*
+INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 36 (52112256 elements): module.decoder.layers.1-2.*
+INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 37 (42287872 elements): module.decoder.layers.0-1.*
False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.1.pre_mlp_layernorm.weight_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) + +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.1.self_attention.linear_qkv.bias +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.0.mlp.linear_fc2.weight_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) +_get_param_groups name 
module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.0.mlp.linear_fc1.weight +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.0.self_attention.linear_qkv.weight_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) + + + + + +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.0.self_attention.linear_proj.weight + +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups 
name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 38 (97326080 elements): +_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) + + +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.0.input_layernorm.weight +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.embedding.word_embeddings.weight + +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 39 (1187840 elements): + +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups 
name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.external_feature_model.pre_proj_layernorm.bias_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.external_feature_model.pre_proj_layernorm.weight +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.external_feature_model.vision_projection.encoder.linear_fc1.weight +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) 
+_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.external_feature_model.vision_projection.encoder.linear_fc2.weight + +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups 
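[Note: the bucket messages above come from megatron.core's distributed data-parallel param-and-grad buffer, which packs parameters into contiguous buckets so their gradients can be reduced together. The snippet below is only a minimal illustrative sketch of that packing idea under assumed names; pack_into_buckets and bucket_size are invented for illustration and are not the megatron.core implementation.]

    # Illustrative sketch only: pack parameters into buckets by element count,
    # mimicking the "Params for bucket N (X elements):" messages above.
    # bucket_size is an assumed knob, not Megatron's actual default.
    from typing import Iterable, List, Tuple

    import torch


    def pack_into_buckets(params: Iterable[torch.nn.Parameter],
                          bucket_size: int = 40_000_000) -> List[Tuple[List[torch.nn.Parameter], int]]:
        buckets, current, current_numel = [], [], 0
        for p in params:
            current.append(p)
            current_numel += p.numel()
            # Close the bucket once it reaches the target element count.
            if current_numel >= bucket_size:
                buckets.append((current, current_numel))
                current, current_numel = [], 0
        if current:  # flush the remaining tail bucket
            buckets.append((current, current_numel))
        return buckets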
[Each rank also prints one _get_param_groups line per model parameter; with <n> ranging over the decoder layer indices, the distinct entries in this span are:]
+_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<n>.input_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<n>.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<n>.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<n>.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<n>.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<n>.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<n>.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name 
module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key 
(1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name 
module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, 
False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name 
module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + 
+_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight 
key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups 
name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, 
False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, 
False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, 
False, False) +_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name 
+_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
+[... the same seven _get_param_groups lines repeat for decoder layers 15 through 47; in the original log this output is interleaved across ranks and duplicated, and is deduplicated here ...]
+_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)
module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + + + + +_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 
1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, 
False, False) + +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight 
key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, 
False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name 
module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name 
module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, 
False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, 
False) + +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name 
module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) 
+_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight 
key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + 
+_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name 
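The key tuples above appear to be per-parameter multipliers used when building optimizer parameter groups: the first element looks like a weight-decay multiplier (0.0 for biases and layernorm weights, 1.0 for all other weights) and the second like a learning-rate multiplier (1.0 throughout this run); the two trailing False flags are left uninterpreted here. A minimal Python sketch of that grouping logic, using hypothetical names and not the actual ModelLink/Megatron _get_param_groups implementation, could look like:

from collections import defaultdict

def build_param_groups(named_params, weight_decay=0.01, lr=1e-5):
    # Group parameters by a (wd_mult, lr_mult) key, mirroring the tuples logged above:
    # biases and layernorm weights are excluded from weight decay (wd_mult = 0.0).
    groups = defaultdict(list)
    for name, param in named_params:
        no_wd = name.endswith(".bias") or "layernorm" in name.lower()
        key = (0.0 if no_wd else 1.0, 1.0)  # (wd_mult, lr_mult)
        groups[key].append(param)
    # One optimizer group per distinct key, with the multipliers applied.
    return [
        {"params": ps, "weight_decay": weight_decay * wd, "lr": lr * lr_mult}
        for (wd, lr_mult), ps in groups.items()
    ]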
+_get_param_groups name module.module.decoder.layers.{0..16}.input_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.{0..16}.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.{0..16}.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.{0..16}.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.{0..16}.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.{0..16}.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.{0..16}.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False,
False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
[... the _get_param_groups listing continues in the same pattern for decoder layers 18 through at least 40: input_layernorm.weight, pre_mlp_layernorm.weight and self_attention.linear_qkv.bias get key (0.0, 1.0, False, False), while self_attention.linear_proj.weight, self_attention.linear_qkv.weight, mlp.linear_fc1.weight and mlp.linear_fc2.weight get key (1.0, 1.0, False, False). It also reports module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False), module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight and linear_fc2.weight key (1.0, 1.0, False, False), and module.module.external_feature_model.pre_proj_layernorm.weight and .bias key (0.0, 1.0, False, False). The remaining lines of this block are the same listing printed concurrently by the other worker processes on the node and interleaved in the log. ...]
+_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key 
(0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name 
module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, 
False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, 
False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, 
False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name 
module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, 
False, False) + +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name 
module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, 
False)_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
+[_get_param_groups output from multiple worker processes was interleaved in this stretch of the log; the duplicated per-rank entries are collapsed below into one line per unique parameter pattern, where <N> stands for the decoder layer indices printed here (0-4 and 12-47). The raw interleaved output resumes afterwards.]
+_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<N>.input_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<N>.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<N>.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<N>.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<N>.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<N>.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<N>.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
(0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +INFO:megatron.core.optimizer:Setting up optimizer with OptimizerConfig(optimizer='adam', lr=5e-06, min_lr=1e-07, decoupled_lr=None, decoupled_min_lr=None, weight_decay=0.0, fp16=False, bf16=True, params_dtype=torch.bfloat16, loss_scale=None, initial_loss_scale=4096.0, min_loss_scale=1.0, loss_scale_window=1000, hysteresis=2, adam_beta1=0.9, adam_beta2=0.999, adam_eps=1e-08, sgd_momentum=0.9, use_distributed_optimizer=True, overlap_grad_reduce=True, overlap_param_gather=False, clip_grad=1.0, log_num_zeros_in_grad=False, barrier_with_L1_time=True, timers=)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name 
module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) 
+_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name 
module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, 
False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name 
module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) 
+_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name 
module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key 
(0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight 
key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name 
module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) 
+_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) 
+_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
+[... analogous _get_param_groups lines repeat for every parameter of decoder.layers.40 through decoder.layers.47: input_layernorm.weight, self_attention.linear_qkv.bias and pre_mlp_layernorm.weight get key (0.0, 1.0, False, False); self_attention.linear_proj.weight, self_attention.linear_qkv.weight, mlp.linear_fc1.weight and mlp.linear_fc2.weight get key (1.0, 1.0, False, False) ...]
+_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)
+[... interleaved with the lines above, another rank logs the same per-parameter keys for decoder.layers.8 through decoder.layers.39; each of its lines is printed once to stdout and echoed once more with the INFO:lcvlm_modellink.core.optimizer prefix ...]
+_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
+_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
+[... the per-parameter lines continue in the same pattern for decoder.layers.1 onward, still interleaved across ranks ...]
module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) + +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) + +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name 
module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, 
False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 
1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name 
module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, 
False) +_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name 
module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name 
module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) 
+_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name 
module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) 
+_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key 
(1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name 
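The key tuples above read like per-parameter multipliers of the form (weight-decay multiplier, learning-rate multiplier, plus two boolean flags), with layernorm weights and attention biases getting a 0.0 weight-decay multiplier and all other weights 1.0. Below is a minimal sketch of how such a grouping could be built; it is illustrative only, the tuple interpretation is an assumption, and this is not the actual lcvlm_modellink/Megatron implementation.

from collections import defaultdict
import torch

def get_param_groups(model: torch.nn.Module, weight_decay: float, lr: float):
    # Bucket parameters by a key tuple like the one printed in the log:
    # (wd_mult, lr_mult, flag1, flag2). Biases and 1-D tensors (layernorms)
    # are exempted from weight decay (wd_mult = 0.0).
    buckets = defaultdict(list)
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        no_wd = name.endswith(".bias") or param.dim() == 1
        key = (0.0 if no_wd else 1.0, 1.0, False, False)
        buckets[key].append(param)
    return [
        {"params": params, "weight_decay": weight_decay * key[0], "lr": lr * key[1]}
        for key, params in buckets.items()
    ]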
+> learning rate decay style: cosine
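The scheduler reported above uses a cosine learning-rate decay: after an optional warmup, the rate is annealed along a half-cosine from its peak value down to a floor. A minimal sketch of that schedule follows; the parameter names and the warmup handling are assumptions for illustration, not values read from this run.

import math

def cosine_lr(step: int, max_lr: float, min_lr: float,
              warmup_iters: int, decay_iters: int) -> float:
    if step < warmup_iters:                      # linear warmup
        return max_lr * step / max(1, warmup_iters)
    if step >= decay_iters:                      # hold at the floor afterwards
        return min_lr
    progress = (step - warmup_iters) / (decay_iters - warmup_iters)
    coeff = 0.5 * (1.0 + math.cos(math.pi * progress))
    return min_lr + coeff * (max_lr - min_lr)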
+_load_base_checkpoint iteration 5000
+_load_base_checkpoint release False
+[... the two lines above are repeated by each of the 16 ranks ...]
+_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_00/model_optim_rng.pt
+_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_01/model_optim_rng.pt
+_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_02/model_optim_rng.pt
+_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_03/model_optim_rng.pt
+_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_04/model_optim_rng.pt
+_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_05/model_optim_rng.pt
+_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_06/model_optim_rng.pt
+_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_07/model_optim_rng.pt
+ loading checkpoint from /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/ at iteration 5000
+load_checkpoint iteration 0
+load_checkpoint release False
+strict True
+[... the three lines above are repeated by each of the 16 ranks ...]
+could not find arguments in the checkpoint ...
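The paths above follow the Megatron-style checkpoint layout, <load_dir>/iter_<7-digit iteration>/mp_rank_<2-digit tensor-parallel rank>/model_optim_rng.pt (pipeline-parallel runs also append the PP rank; pp=1 here, so it is omitted). A small sketch of assembling such a path; the helper name is hypothetical and this is not code from this repository.

import os

def checkpoint_path(load_dir: str, iteration: int, tp_rank: int) -> str:
    # Mirrors the directory layout seen in the log lines above.
    return os.path.join(load_dir,
                        f"iter_{iteration:07d}",
                        f"mp_rank_{tp_rank:02d}",
                        "model_optim_rng.pt")

# e.g. checkpoint_path(CKPT_LOAD_DIR, 5000, 3)
#   -> .../iter_0005000/mp_rank_03/model_optim_rng.pt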
+ checkpoint version 3.0
+ successfully loaded checkpoint from /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/ [ t 0, p 0 ] at iteration 0
+[after model, optimizer, and learning rate scheduler are built] datetime: 2024-11-27 12:45:29 
+> building train, validation, and test datasets ...
+ > datasets target sizes (minimum size):
+    train: 64000
+    validation: 0
+    test: 0
+INFO:megatron.core.datasets.blended_megatron_dataset_config:mock = False
+INFO:megatron.core.datasets.blended_megatron_dataset_config:Let split_matrix = [(0, 1.0), None, None]
+> building train, validation, and test datasets for GPT ...
+> rank 12 does not create GPT datasets ...
+> rank 9 does not create GPT datasets ...
+> rank 5 does not create GPT datasets ...
+> rank 6 does not create GPT datasets ...
+> rank 14 does not create GPT datasets ...
+> rank 2 does not create GPT datasets ...
+> rank 3 does not create GPT datasets ...
+> rank 15 does not create GPT datasets ...
+> rank 4 does not create GPT datasets ...
+> rank 10 does not create GPT datasets ...
+> rank 11 does not create GPT datasets ...
+> rank 7 does not create GPT datasets ...
+> rank 13 does not create GPT datasets ...
+> rank 8 is creating GPT datasets ...
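The split_matrix value [(0, 1.0), None, None] says the blended dataset config maps the entire [0, 1] document range to the train split and builds no validation or test split, which matches the target sizes above (train 64000, validation 0, test 0). A rough sketch of how a "train,valid,test" split such as "100,0,0" could be turned into that matrix; this is assumed behavior for illustration, not the megatron.core implementation.

from typing import List, Optional, Tuple

def split_vector_to_matrix(split: List[float]) -> List[Optional[Tuple[float, float]]]:
    # Normalize the weights and lay the splits end to end on [0, 1];
    # a zero-weight split becomes None (no dataset is built for it).
    total = sum(split)
    bounds, acc = [], 0.0
    for frac in (s / total for s in split):
        bounds.append(None if frac == 0.0 else (acc, acc + frac))
        acc += frac
    return bounds

print(split_vector_to_matrix([100, 0, 0]))  # -> [(0.0, 1.0), None, None]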
+tokenizer Qwen2TokenizerFast(name_or_path='/data_4/models/Qwen/Qwen2.5-14B-Instruct/', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False), added_tokens_decoder={
+ 151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
+ 151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
+ 151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
+ 151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
+ 151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
+ 151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
+ 151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
+ 151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
+ 151665: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151666: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151667: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151668: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151669: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151670: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151671: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151672: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151673: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151674: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151675: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151676: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151677: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151678: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151679: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151680: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151681: AddedToken("
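
The dump above is the repr of the Hugging Face tokenizer that the data pipeline builds: the stock Qwen2.5-14B-Instruct fast tokenizer (base vocab 151643, 128K model_max_length) with its chat, vision and FIM added tokens, followed by a block of project-specific special tokens from id 151665 upward whose literal strings are not preserved in the capture above. The sketch below is not part of the run; it shows how the same information can be reproduced with standard transformers APIs, assuming the tokenizer files at that path are unchanged.

# Offline sketch (not part of the log above): load the same tokenizer and print the
# fields shown in the dump. Only standard transformers tokenizer APIs are used.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("/data_4/models/Qwen/Qwen2.5-14B-Instruct/")

print(type(tok).__name__)          # Qwen2TokenizerFast
print(tok.vocab_size)              # 151643 (base vocab, excludes added tokens)
print(tok.model_max_length)        # 131072
print(tok.special_tokens_map)      # eos '<|im_end|>', pad '<|endoftext|>', additional specials
print(len(tok))                    # base vocab plus added tokens

# added_tokens_decoder maps token id -> AddedToken, as in the dump above
for idx, added in sorted(tok.added_tokens_decoder.items()):
    print(idx, repr(added.content), "special" if added.special else "non-special")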