diff --git "a/log_node0.txt" "b/log_node0.txt" new file mode 100644--- /dev/null +++ "b/log_node0.txt" @@ -0,0 +1,20471 @@ ++ echo Logging output to /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//log_node0.txt +Logging output to /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//log_node0.txt ++ export ASCEND_PROCESS_LOG_PATH=/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//ascend/0 ++ ASCEND_PROCESS_LOG_PATH=/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//ascend/0 ++ mkdir -p /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//ascend/0 ++ DATA_PATH=/local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml ++ TOKENIZER_PATH=/data_4/models/Qwen/Qwen2.5-14B-Instruct/ ++ CKPT_LOAD_DIR=/data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/ ++ VIT_CKPT_LOAD_DIR=/ ++ CKPT_SAVE_DIR=/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213// ++ rsync -avh /local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/ +sending incremental file list + +sent 71 bytes received 12 bytes 166.00 bytes/sec +total size is 23.84K speedup is 287.17 ++ cd /local_disk/cognitron_vl/ ++ rm -fr datasets ++ mkdir -p datasets ++ ln -s /data/data/ datasets/CV ++ ln -s /data/data/LLM datasets/LLM ++ ln -s /data/data/LMM datasets/LMM ++ source /local_disk/cognitron_vl//scripts/set_env_mg_npu.sh +++ source /usr/local/Ascend/driver/bin/setenv.bash ++++ DEP_INFO_FILE=/etc/ascend_install.info ++++ [[ -f /etc/ascend_install.info ]] ++++ . 
/etc/ascend_install.info ++++ DRV_LIB64_COMMON_LDPATH=/driver/lib64/common ++++ DRV_LIB64_DRV_LDPATH=/driver/lib64/driver ++++ DRV_LIB64_LDPATH=/driver/lib64 ++++ export LD_LIBRARY_PATH=/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: ++++ LD_LIBRARY_PATH=/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: ++++ export PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin ++++ PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin +++ source /usr/local/Ascend/ascend-toolkit/set_env.sh ++++ export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: ++++ 
LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: ++++ export ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest ++++ ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest +++++ arch ++++ export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: ++++ LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: ++++ export 
LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: ++++ LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: ++++ export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe: ++++ PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe: ++++ export PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin ++++ 
PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin ++++ export ASCEND_AICPU_PATH=/usr/local/Ascend/ascend-toolkit/latest ++++ ASCEND_AICPU_PATH=/usr/local/Ascend/ascend-toolkit/latest ++++ export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp ++++ ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp ++++ export TOOLCHAIN_HOME=/usr/local/Ascend/ascend-toolkit/latest/toolkit ++++ TOOLCHAIN_HOME=/usr/local/Ascend/ascend-toolkit/latest/toolkit ++++ export ASCEND_HOME_PATH=/usr/local/Ascend/ascend-toolkit/latest ++++ ASCEND_HOME_PATH=/usr/local/Ascend/ascend-toolkit/latest +++ export HCCL_CONNECT_TIMEOUT=7200 +++ HCCL_CONNECT_TIMEOUT=7200 +++ export HCCL_EXEC_TIMEOUT=7200 +++ HCCL_EXEC_TIMEOUT=7200 +++ export COMBINED_ENABLE=1 +++ COMBINED_ENABLE=1 +++ export MULTI_STREAM_MEMORY_REUSE=1 +++ MULTI_STREAM_MEMORY_REUSE=1 +++ export HCCL_RDMA_TC=160 +++ HCCL_RDMA_TC=160 +++ export HCCL_RDMA_SL=5 +++ HCCL_RDMA_SL=5 +++ export HCCL_INTRA_PCIE_ENABLE=0 +++ HCCL_INTRA_PCIE_ENABLE=0 +++ export HCCL_INTRA_ROCE_ENABLE=1 +++ HCCL_INTRA_ROCE_ENABLE=1 +++ export HCCL_RDMA_TIMEOUT=20 +++ HCCL_RDMA_TIMEOUT=20 +++ export INF_NAN_MODE_ENABLE=1 +++ INF_NAN_MODE_ENABLE=1 +++ export DISTRIBUTED_BACKEND=hccl +++ DISTRIBUTED_BACKEND=hccl +++ export ASCEND_LAUNCH_BLOCKING=0 +++ ASCEND_LAUNCH_BLOCKING=0 +++ export ASCEND_SLOG_PRINT_TO_STDOUT=0 +++ ASCEND_SLOG_PRINT_TO_STDOUT=0 +++ export ASCEND_GLOBAL_LOG_LEVEL=3 +++ ASCEND_GLOBAL_LOG_LEVEL=3 +++ export ASCEND_GLOBAL_EVENT_ENABLE=0 +++ ASCEND_GLOBAL_EVENT_ENABLE=0 +++ export TASK_QUEUE_ENABLE=1 +++ TASK_QUEUE_ENABLE=1 +++ export PTCOPY_ENABLE=1 +++ PTCOPY_ENABLE=1 +++ export COMBINED_ENABLE=1 +++ COMBINED_ENABLE=1 +++ export DYNAMIC_OP=ADD#MUL +++ DYNAMIC_OP=ADD#MUL +++ export HCCL_WHITELIST_DISABLE=1 +++ HCCL_WHITELIST_DISABLE=1 +++ export HCCL_CONNECT_TIMEOUT=7200 +++ HCCL_CONNECT_TIMEOUT=7200 +++ export HCCL_WHITELIST_DISABLE=1 +++ HCCL_WHITELIST_DISABLE=1 +++ export CUDA_DEVICE_MAX_CONNECTIONS=1 +++ CUDA_DEVICE_MAX_CONNECTIONS=1 +++ pip3 install --no-index --find-links=/data/software/ -r requirements_npu.txt +Looking in links: /data/software/ +Processing data/software/expecttest-0.2.1-py3-none-any.whl (from -r requirements_npu.txt (line 1)) +Requirement already satisfied: peft in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 2)) (0.7.0) +Processing data/software/XlsxWriter-3.2.0-py3-none-any.whl (from -r requirements_npu.txt (line 3)) +Requirement already satisfied: termcolor in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 4)) (2.4.0) +Requirement already satisfied: tabulate in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 5)) (0.9.0) +Processing data/software/tiktoken-0.7.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from -r requirements_npu.txt (line 6)) +Requirement already satisfied: matplotlib in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 7)) (3.7.5) +Processing 
data/software/datasets-3.0.0-py3-none-any.whl (from -r requirements_npu.txt (line 8)) +Requirement already satisfied: einops in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 9)) (0.7.0) +Processing data/software/pybind11-2.13.6-py3-none-any.whl (from -r requirements_npu.txt (line 10)) +Requirement already satisfied: tensorboardX in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 11)) (2.6.2.2) +Processing data/software/pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from -r requirements_npu.txt (line 12)) +Requirement already satisfied: transformers>=4.40.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 13)) (4.40.1) +Requirement already satisfied: deepspeed>=0.14.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 14)) (0.14.5) +Processing data/software/accelerate-0.34.2-py3-none-any.whl (from -r requirements_npu.txt (line 15)) +Requirement already satisfied: timm in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 16)) (0.9.16) +Processing data/software/flask-3.0.3-py3-none-any.whl (from -r requirements_npu.txt (line 17)) +Processing data/software/Flask_RESTful-0.3.10-py2.py3-none-any.whl (from -r requirements_npu.txt (line 18)) +Processing data/software/decord-0.6.0-py3-none-manylinux2010_x86_64.whl (from -r requirements_npu.txt (line 19)) +Processing data/software/natsort-8.4.0-py3-none-any.whl (from -r requirements_npu.txt (line 20)) +Requirement already satisfied: numpy>=1.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (1.24.4) +Requirement already satisfied: packaging>=20.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (23.2) +Requirement already satisfied: psutil in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (5.9.8) +Requirement already satisfied: pyyaml in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (5.4.1) +Requirement already satisfied: torch>=1.13.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (2.1.0+cpu) +Requirement already satisfied: tqdm in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (4.66.2) +Requirement already satisfied: safetensors in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (0.4.2) +Requirement already satisfied: huggingface-hub>=0.17.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (0.20.3) +Requirement already satisfied: regex>=2022.1.18 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from tiktoken->-r requirements_npu.txt (line 6)) (2023.12.25) +Requirement already satisfied: requests>=2.26.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from tiktoken->-r requirements_npu.txt (line 6)) (2.31.0) +Requirement already satisfied: contourpy>=1.0.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (1.1.1) +Requirement already satisfied: cycler>=0.10 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (0.12.1) +Requirement already 
satisfied: fonttools>=4.22.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (4.49.0) +Requirement already satisfied: kiwisolver>=1.0.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (1.4.5) +Requirement already satisfied: pillow>=6.2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (10.2.0) +Requirement already satisfied: pyparsing>=2.3.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (3.1.1) +Requirement already satisfied: python-dateutil>=2.7 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (2.8.2) +Requirement already satisfied: importlib-resources>=3.2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (6.1.2) +Requirement already satisfied: filelock in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (3.13.1) +Requirement already satisfied: dill<0.3.9,>=0.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (0.3.7) +Requirement already satisfied: pandas in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (2.0.3) +Processing data/software/requests-2.32.3-py3-none-any.whl (from tiktoken->-r requirements_npu.txt (line 6)) +Processing data/software/tqdm-4.67.1-py3-none-any.whl (from peft->-r requirements_npu.txt (line 2)) +Requirement already satisfied: xxhash in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (3.4.1) +Requirement already satisfied: multiprocess in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (0.70.15) +Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (2023.10.0) +Requirement already satisfied: aiohttp in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (3.9.3) +Processing data/software/huggingface_hub-0.26.2-py3-none-any.whl (from peft->-r requirements_npu.txt (line 2)) +Requirement already satisfied: protobuf>=3.20 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from tensorboardX->-r requirements_npu.txt (line 11)) (4.25.3) +Requirement already satisfied: tokenizers<0.20,>=0.19 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers>=4.40.1->-r requirements_npu.txt (line 13)) (0.19.1) +Requirement already satisfied: hjson in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (3.1.0) +Requirement already satisfied: ninja in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (1.11.1.1) +Requirement already satisfied: nvidia-ml-py in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (12.560.30) +Requirement already satisfied: py-cpuinfo in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt 
(line 14)) (9.0.0) +Requirement already satisfied: pydantic in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (1.10.15) +Processing data/software/safetensors-0.4.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from peft->-r requirements_npu.txt (line 2)) +Requirement already satisfied: torchvision in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from timm->-r requirements_npu.txt (line 16)) (0.16.0) +Requirement already satisfied: Werkzeug>=3.0.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (3.0.1) +Requirement already satisfied: Jinja2>=3.1.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (3.1.3) +Processing data/software/itsdangerous-2.2.0-py3-none-any.whl (from flask->-r requirements_npu.txt (line 17)) +Requirement already satisfied: click>=8.1.3 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (8.1.7) +Processing data/software/blinker-1.8.2-py3-none-any.whl (from flask->-r requirements_npu.txt (line 17)) +Requirement already satisfied: importlib-metadata>=3.6.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (7.0.1) +Processing data/software/aniso8601-9.0.1-py2.py3-none-any.whl (from flask_restful->-r requirements_npu.txt (line 18)) +Requirement already satisfied: six>=1.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask_restful->-r requirements_npu.txt (line 18)) (1.16.0) +Requirement already satisfied: pytz in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask_restful->-r requirements_npu.txt (line 18)) (2024.1) +Requirement already satisfied: aiosignal>=1.1.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (1.3.1) +Requirement already satisfied: attrs>=17.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (23.2.0) +Requirement already satisfied: frozenlist>=1.1.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (1.4.1) +Requirement already satisfied: multidict<7.0,>=4.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (6.0.5) +Requirement already satisfied: yarl<2.0,>=1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (1.9.4) +Requirement already satisfied: async-timeout<5.0,>=4.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (4.0.3) +Requirement already satisfied: typing-extensions>=3.7.4.3 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from huggingface-hub>=0.17.0->peft->-r requirements_npu.txt (line 2)) (4.10.0) +Requirement already satisfied: zipp>=0.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from importlib-metadata>=3.6.0->flask->-r requirements_npu.txt (line 17)) (3.17.0) +Requirement already satisfied: MarkupSafe>=2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from Jinja2>=3.1.2->flask->-r requirements_npu.txt (line 17)) (2.1.5) +Requirement already satisfied: charset-normalizer<4,>=2 in 
/root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (3.3.2) +Requirement already satisfied: idna<4,>=2.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (3.6) +Requirement already satisfied: urllib3<3,>=1.21.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (1.26.18) +Requirement already satisfied: certifi>=2017.4.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (2024.2.2) +Requirement already satisfied: sympy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft->-r requirements_npu.txt (line 2)) (1.4) +Requirement already satisfied: networkx in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft->-r requirements_npu.txt (line 2)) (3.1) +Requirement already satisfied: tzdata>=2022.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (2024.1) +Requirement already satisfied: mpmath>=0.19 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from sympy->torch>=1.13.0->peft->-r requirements_npu.txt (line 2)) (1.3.0) +DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063 +Installing collected packages: aniso8601, xlsxwriter, tqdm, safetensors, requests, pybind11, pyarrow, natsort, itsdangerous, expecttest, decord, blinker, tiktoken, huggingface-hub, flask, flask_restful, accelerate, datasets + Attempting uninstall: tqdm + Found existing installation: tqdm 4.66.2 + Uninstalling tqdm-4.66.2: + Successfully uninstalled tqdm-4.66.2 + Attempting uninstall: safetensors + Found existing installation: safetensors 0.4.2 + Uninstalling safetensors-0.4.2: + Successfully uninstalled safetensors-0.4.2 + Attempting uninstall: requests + Found existing installation: requests 2.31.0 + Uninstalling requests-2.31.0: + Successfully uninstalled requests-2.31.0 + Attempting uninstall: pyarrow + Found existing installation: pyarrow 15.0.0 + Uninstalling pyarrow-15.0.0: + Successfully uninstalled pyarrow-15.0.0 + Attempting uninstall: huggingface-hub + Found existing installation: huggingface-hub 0.20.3 + Uninstalling huggingface-hub-0.20.3: + Successfully uninstalled huggingface-hub-0.20.3 + Attempting uninstall: accelerate + Found existing installation: accelerate 0.25.0 + Uninstalling accelerate-0.25.0: + Successfully uninstalled accelerate-0.25.0 + Attempting uninstall: datasets + Found existing installation: datasets 2.16.0 + Uninstalling datasets-2.16.0: + Successfully uninstalled datasets-2.16.0 +ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. +tikit 1.8.2.240926 requires dicttoxml==1.7.4, which is not installed. +tikit 1.8.2.240926 requires docopt==0.6.2, which is not installed. +tikit 1.8.2.240926 requires future==0.18.2, which is not installed. +tikit 1.8.2.240926 requires hdfs==2.6.0, which is not installed. 
+tikit 1.8.2.240926 requires pure-sasl==0.6.2, which is not installed. +tikit 1.8.2.240926 requires py4j==0.10.7, which is not installed. +tikit 1.8.2.240926 requires PyHive[hive]==0.6.4, which is not installed. +tikit 1.8.2.240926 requires pyjwt>=2.4.0, which is not installed. +tikit 1.8.2.240926 requires requests-kerberos>=0.14.0, which is not installed. +tikit 1.8.2.240926 requires sasl==0.3.1, which is not installed. +tikit 1.8.2.240926 requires thrift==0.15.0, which is not installed. +tikit 1.8.2.240926 requires thrift-sasl>=0.1.0, which is not installed. +tikit 1.8.2.240926 requires certifi==2021.10.8, but you have certifi 2024.2.2 which is incompatible. +tikit 1.8.2.240926 requires cos-python-sdk-v5==1.9.29, but you have cos-python-sdk-v5 1.9.26 which is incompatible. +tikit 1.8.2.240926 requires idna==3.3, but you have idna 3.6 which is incompatible. +tikit 1.8.2.240926 requires prettytable==2.5.0, but you have prettytable 3.11.0 which is incompatible. +tikit 1.8.2.240926 requires urllib3==1.26.7, but you have urllib3 1.26.18 which is incompatible. +tikit 1.8.2.240926 requires wcwidth==0.2.5, but you have wcwidth 0.2.13 which is incompatible. +Successfully installed accelerate-0.34.2 aniso8601-9.0.1 blinker-1.8.2 datasets-3.0.0 decord-0.6.0 expecttest-0.2.1 flask-3.0.3 flask_restful-0.3.10 huggingface-hub-0.26.2 itsdangerous-2.2.0 natsort-8.4.0 pyarrow-17.0.0 pybind11-2.13.6 requests-2.32.3 safetensors-0.4.5 tiktoken-0.7.0 tqdm-4.67.1 xlsxwriter-3.2.0 +WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv +++ return 0 ++ MEGATRON_DIR=/local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0/ ++ MINDSPEED_DIR=/local_disk/cognitron_vl//third_party/MindSpeed_core_r0.6.0/ ++ MODELLINK_DIR=/local_disk/cognitron_vl//third_party/ModelLink/ ++ pip3 install --no-index --find-links=/data/software/ -e /local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0/ +Looking in links: /data/software/ +Obtaining file://local_disk/cognitron_vl/third_party/Megatron-LM_core_r0.6.0 + Installing build dependencies: started + Installing build dependencies: finished with status 'done' + Checking if build backend supports build_editable: started + Checking if build backend supports build_editable: finished with status 'done' + Getting requirements to build editable: started + Getting requirements to build editable: finished with status 'done' + Installing backend dependencies: started + Installing backend dependencies: finished with status 'done' + Preparing editable metadata (pyproject.toml): started + Preparing editable metadata (pyproject.toml): finished with status 'done' +Building wheels for collected packages: megatron_core + Building editable for megatron_core (pyproject.toml): started + Building editable for megatron_core (pyproject.toml): finished with status 'done' + Created wheel for megatron_core: filename=megatron_core-0.6.0-0.editable-cp38-cp38-linux_x86_64.whl size=8791 sha256=06d5bd071b6eadb2bc6965a495bd802172dae415af74dd60b1478328d6910bcd + Stored in directory: /tmp/pip-ephem-wheel-cache-m7mtfhay/wheels/54/9c/d1/d2015aa0c34e791e64d65d19395e5a9a5528f0c63fd519b9ff +Successfully built megatron_core +DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. 
A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063 +Installing collected packages: megatron_core +Successfully installed megatron_core-0.6.0 +WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv ++ pip3 install --no-index --find-links=/data/software/ -e /local_disk/cognitron_vl//third_party/MindSpeed_core_r0.6.0/ +Looking in links: /data/software/ +Obtaining file://local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0 + Preparing metadata (setup.py): started + Preparing metadata (setup.py): finished with status 'done' +WARNING: Error parsing requirements for tokenizers: [Errno 2] No such file or directory: '/root/miniconda3/envs/py38/lib/python3.8/site-packages/tokenizers-0.19.1.dist-info/METADATA' +WARNING: Error parsing requirements for transformers: [Errno 2] No such file or directory: '/root/miniconda3/envs/py38/lib/python3.8/site-packages/transformers-4.40.1.dist-info/METADATA' +DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063 +Installing collected packages: mindspeed + Running setup.py develop for mindspeed +Successfully installed mindspeed-0.6.0 +WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv ++ pip3 install --no-index --find-links=/data/software/ -e /local_disk/cognitron_vl//third_party/ModelLink/ +Looking in links: /data/software/ +Obtaining file://local_disk/cognitron_vl/third_party/ModelLink + Preparing metadata (setup.py): started + Preparing metadata (setup.py): finished with status 'done' +Requirement already satisfied: numpy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.24.4) +Processing data/software/transformers-4.43.2-py3-none-any.whl (from modellink==0.0.1) +Processing data/software/transformers-stream-generator-0.0.5.tar.gz (from modellink==0.0.1) + Preparing metadata (setup.py): started + Preparing metadata (setup.py): finished with status 'done' +Requirement already satisfied: sympy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.4) +Requirement already satisfied: decorator in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (5.1.1) +Requirement already satisfied: scipy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.10.1) +Requirement already satisfied: sentencepiece in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.2.0) +Requirement already satisfied: einops in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.7.0) +Requirement already satisfied: datasets in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (3.0.0) +Requirement already satisfied: pybind11 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (2.13.6) +Requirement already satisfied: accelerate in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.34.2) +Requirement already satisfied: six in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.16.0) +Requirement already satisfied: protobuf in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (4.25.3) +Processing data/software/peft-0.7.1-py3-none-any.whl (from modellink==0.0.1) +Requirement already satisfied: tiktoken in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.7.0) +Requirement already satisfied: packaging>=20.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (23.2) +Requirement already satisfied: psutil in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (5.9.8) +Requirement already satisfied: pyyaml in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (5.4.1) +Requirement already satisfied: torch>=1.13.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (2.1.0+cpu) +Requirement already satisfied: tqdm in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (4.67.1) +Requirement already satisfied: safetensors in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (0.4.5) +Requirement already satisfied: huggingface-hub>=0.17.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (0.26.2) +Requirement already satisfied: filelock in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers==4.43.2->modellink==0.0.1) (3.13.1) +Requirement already satisfied: 
regex!=2019.12.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers==4.43.2->modellink==0.0.1) (2023.12.25) +Requirement already satisfied: requests in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers==4.43.2->modellink==0.0.1) (2.32.3) +Processing data/software/tokenizers-0.19.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from transformers==4.43.2->modellink==0.0.1) +Requirement already satisfied: pyarrow>=15.0.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (17.0.0) +Requirement already satisfied: dill<0.3.9,>=0.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (0.3.7) +Requirement already satisfied: pandas in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (2.0.3) +Requirement already satisfied: xxhash in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (3.4.1) +Requirement already satisfied: multiprocess in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (0.70.15) +Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets->modellink==0.0.1) (2023.10.0) +Requirement already satisfied: aiohttp in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (3.9.3) +Requirement already satisfied: mpmath>=0.19 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from sympy->modellink==0.0.1) (1.3.0) +Requirement already satisfied: aiosignal>=1.1.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (1.3.1) +Requirement already satisfied: attrs>=17.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (23.2.0) +Requirement already satisfied: frozenlist>=1.1.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (1.4.1) +Requirement already satisfied: multidict<7.0,>=4.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (6.0.5) +Requirement already satisfied: yarl<2.0,>=1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (1.9.4) +Requirement already satisfied: async-timeout<5.0,>=4.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (4.0.3) +Requirement already satisfied: typing-extensions>=3.7.4.3 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from huggingface-hub>=0.17.0->peft==0.7.1->modellink==0.0.1) (4.10.0) +Requirement already satisfied: charset-normalizer<4,>=2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (3.3.2) +Requirement already satisfied: idna<4,>=2.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (3.6) +Requirement already satisfied: urllib3<3,>=1.21.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (1.26.18) +Requirement already satisfied: certifi>=2017.4.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (2024.2.2) +Requirement already satisfied: networkx in 
/root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft==0.7.1->modellink==0.0.1) (3.1) +Requirement already satisfied: jinja2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft==0.7.1->modellink==0.0.1) (3.1.3) +Requirement already satisfied: python-dateutil>=2.8.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets->modellink==0.0.1) (2.8.2) +Requirement already satisfied: pytz>=2020.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets->modellink==0.0.1) (2024.1) +Requirement already satisfied: tzdata>=2022.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets->modellink==0.0.1) (2024.1) +Requirement already satisfied: MarkupSafe>=2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from jinja2->torch>=1.13.0->peft==0.7.1->modellink==0.0.1) (2.1.5) +Building wheels for collected packages: transformers_stream_generator + Building wheel for transformers_stream_generator (setup.py): started + Building wheel for transformers_stream_generator (setup.py): finished with status 'done' + Created wheel for transformers_stream_generator: filename=transformers_stream_generator-0.0.5-py3-none-any.whl size=12425 sha256=3ed62a866ab10917ceed94a0bafc0596380802f798ed67b7de78b76fe0b65f1f + Stored in directory: /root/.cache/pip/wheels/56/8c/42/5381d9c36bc85f28982f4cf8f98dc44d37a6d6c04897a5cb7c +Successfully built transformers_stream_generator +DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063 +Installing collected packages: tokenizers, transformers, transformers_stream_generator, peft, modellink + Attempting uninstall: tokenizers + Found existing installation: tokenizers 0.20.3 + Uninstalling tokenizers-0.20.3: + Successfully uninstalled tokenizers-0.20.3 + Attempting uninstall: transformers + Found existing installation: transformers 4.46.3 + Uninstalling transformers-4.46.3: + Successfully uninstalled transformers-4.46.3 + Attempting uninstall: peft + Found existing installation: peft 0.7.0 + Uninstalling peft-0.7.0: + Successfully uninstalled peft-0.7.0 + Running setup.py develop for modellink +Successfully installed modellink-0.0.1 peft-0.7.1 tokenizers-0.19.1 transformers-4.43.2 transformers_stream_generator-0.0.5 +WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv ++ export PYTHONPATH=/local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0//:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe: ++ PYTHONPATH=/local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0//:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe: ++ GPUS_PER_NODE=16 ++ NNODES=32 ++ NODE_RANK=0 ++ MASTER_PORT=34567 ++ export CUDA_DEVICE_MAX_CONNECTIONS=1 ++ CUDA_DEVICE_MAX_CONNECTIONS=1 ++ VISION_SEQ_LENGTH=1025 ++ IMAGE_TOKEN_LENGTH=256 ++ IMAGE_SIZE=448 ++ VISION_MODEL_TYPE=intern_300m ++ TP=8 ++ PP=1 ++ CP=2 ++ CP_ALGO=megatron_cp_algo ++ CP_MASK=causal ++ DISTRIBUTED_ARGS=' + --nproc_per_node 16 --nnodes 32 --node_rank 0 --master_addr train-1197954740059955456-93njiyzl9b0g-master-0.train-100034032793.svc.cluster.local --master_port 34567 +' ++ GPT_ARGS=' + --use-mcore-models --tensor-model-parallel-size 8 --pipeline-model-parallel-size 1 --context-parallel-size 2 --context-parallel-algo megatron_cp_algo --cp-attention-mask-type causal --use-cp-send-recv-overlap --no-create-attention-mask-in-dataloader --sparse-mode 4 --sequence-parallel --recompute-method block --recompute-granularity full --recompute-num-layers 48 --num-layers 48 --hidden-size 5120 --ffn-hidden-size 13824 --num-attention-heads 40 --group-query-attention --num-query-groups 8 --tokenizer-type PretrainedFromHF --tokenizer-name-or-path /data_4/models/Qwen/Qwen2.5-14B-Instruct/ --seq-length 131072 --max-position-embeddings 131072 --micro-batch-size 1 --global-batch-size 64 --make-vocab-size-divisible-by 1 --padded-vocab-size 152064 --rotary-base 1000000.0 --lr 5.00e-6 --train-iters 1000 --lr-decay-style cosine --untie-embeddings-and-output-weights --disable-bias-linear --attention-dropout 0.0 --init-method-std 0.01 --hidden-dropout 0.0 --position-embedding-type rope --normalization RMSNorm --use-fused-rmsnorm --norm-epsilon 1e-6 --swiglu --use-flash-attn --use-fused-rotary-pos-emb --use-rotary-position-embeddings --use-fused-swiglu --use-mc2 --no-masked-softmax-fusion --attention-softmax-in-fp32 --min-lr 1.00e-7 --weight-decay 0.0 --lr-warmup-fraction 0.03 --clip-grad 1.0 --adam-beta1 0.9 --adam-beta2 0.999 --add-qkv-bias --initial-loss-scale 4096 --no-gradient-accumulation-fusion --use-distributed-optimizer --bf16 --overlap-grad-reduce --finetune --vision-model-freeze --vision-model-type intern_300m --vision-downsample-ratio 0.5 --vision-projector-type mlp --vision-projector-pre-norm --vision-process-type dynamic --vision-normalize-type imagenet --vision-seq-length 1025 --image-token-length 256 --image-size 448 --prompt-format qwen2 --is-instruction-dataset --max-num-frame 512 --max-fps 1 --add-class-token --min-patch-grid 1 --max-patch-grid 12 --cross-dataset-joint ' ++ DATA_ARGS=' + --data-path /local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml --split 100,0,0 --data-seq-length 131072 --num-workers 8 ' ++ CKPT_ARGS=' + --load /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/ --vit-load / --no-load-optim 
--no-load-rng --seed 424242 --save /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213// ' ++ OUTPUT_ARGS=' + --log-interval 1 --save-interval 100 --eval-interval 100 --eval-iters 0 --log-throughput --distributed-timeout-minutes 120 ' ++ torchrun --nproc_per_node 16 --nnodes 32 --node_rank 0 --master_addr train-1197954740059955456-93njiyzl9b0g-master-0.train-100034032793.svc.cluster.local --master_port 34567 /local_disk/cognitron_vl//lcvlm_modellink/pretrain_lcvlm.py --use-mcore-models --tensor-model-parallel-size 8 --pipeline-model-parallel-size 1 --context-parallel-size 2 --context-parallel-algo megatron_cp_algo --cp-attention-mask-type causal --use-cp-send-recv-overlap --no-create-attention-mask-in-dataloader --sparse-mode 4 --sequence-parallel --recompute-method block --recompute-granularity full --recompute-num-layers 48 --num-layers 48 --hidden-size 5120 --ffn-hidden-size 13824 --num-attention-heads 40 --group-query-attention --num-query-groups 8 --tokenizer-type PretrainedFromHF --tokenizer-name-or-path /data_4/models/Qwen/Qwen2.5-14B-Instruct/ --seq-length 131072 --max-position-embeddings 131072 --micro-batch-size 1 --global-batch-size 64 --make-vocab-size-divisible-by 1 --padded-vocab-size 152064 --rotary-base 1000000.0 --lr 5.00e-6 --train-iters 1000 --lr-decay-style cosine --untie-embeddings-and-output-weights --disable-bias-linear --attention-dropout 0.0 --init-method-std 0.01 --hidden-dropout 0.0 --position-embedding-type rope --normalization RMSNorm --use-fused-rmsnorm --norm-epsilon 1e-6 --swiglu --use-flash-attn --use-fused-rotary-pos-emb --use-rotary-position-embeddings --use-fused-swiglu --use-mc2 --no-masked-softmax-fusion --attention-softmax-in-fp32 --min-lr 1.00e-7 --weight-decay 0.0 --lr-warmup-fraction 0.03 --clip-grad 1.0 --adam-beta1 0.9 --adam-beta2 0.999 --add-qkv-bias --initial-loss-scale 4096 --no-gradient-accumulation-fusion --use-distributed-optimizer --bf16 --overlap-grad-reduce --finetune --vision-model-freeze --vision-model-type intern_300m --vision-downsample-ratio 0.5 --vision-projector-type mlp --vision-projector-pre-norm --vision-process-type dynamic --vision-normalize-type imagenet --vision-seq-length 1025 --image-token-length 256 --image-size 448 --prompt-format qwen2 --is-instruction-dataset --max-num-frame 512 --max-fps 1 --add-class-token --min-patch-grid 1 --max-patch-grid 12 --cross-dataset-joint --data-path /local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml --split 100,0,0 --data-seq-length 131072 --num-workers 8 --load /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/ --vit-load / --no-load-optim --no-load-rng --seed 424242 --save /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213// --log-interval 1 --save-interval 100 --eval-interval 100 --eval-iters 0 --log-throughput --distributed-timeout-minutes 120 --distributed-backend nccl +[2024-11-27 12:44:15,231] torch.distributed.run: [WARNING] +[2024-11-27 12:44:15,231] torch.distributed.run: [WARNING] ***************************************** +[2024-11-27 12:44:15,231] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
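For readability, the torchrun invocation traced above is equivalent to a launch script of roughly the following shape. This is a condensed sketch assembled only from the variables and flags recorded in this log (entry point, paths, parallel sizes and batch settings are copied from the trace); it is not the original finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh, and the model/vision flag list is abbreviated to a representative subset.

#!/bin/bash
# Condensed sketch of the launch recorded above; flag lists are abbreviated.
GPUS_PER_NODE=16
NNODES=32
NODE_RANK=0
MASTER_ADDR=train-1197954740059955456-93njiyzl9b0g-master-0.train-100034032793.svc.cluster.local
MASTER_PORT=34567
TP=8; PP=1; CP=2    # tensor / pipeline / context parallel sizes

DATA_PATH=/local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml
TOKENIZER_PATH=/data_4/models/Qwen/Qwen2.5-14B-Instruct/
CKPT_LOAD_DIR=/data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/
CKPT_SAVE_DIR=/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

# Abbreviated: the full model/vision flag list is in the traced command above.
GPT_ARGS="--use-mcore-models --tensor-model-parallel-size $TP --pipeline-model-parallel-size $PP \
    --context-parallel-size $CP --context-parallel-algo megatron_cp_algo --cp-attention-mask-type causal \
    --tokenizer-type PretrainedFromHF --tokenizer-name-or-path $TOKENIZER_PATH \
    --seq-length 131072 --max-position-embeddings 131072 --micro-batch-size 1 --global-batch-size 64 --bf16"

DATA_ARGS="--data-path $DATA_PATH --split 100,0,0 --data-seq-length 131072 --num-workers 8"
CKPT_ARGS="--load $CKPT_LOAD_DIR --vit-load / --no-load-optim --no-load-rng --seed 424242 --save $CKPT_SAVE_DIR"
OUTPUT_ARGS="--log-interval 1 --save-interval 100 --eval-interval 100 --eval-iters 0 \
    --log-throughput --distributed-timeout-minutes 120"

torchrun $DISTRIBUTED_ARGS /local_disk/cognitron_vl//lcvlm_modellink/pretrain_lcvlm.py \
    $GPT_ARGS $DATA_ARGS $CKPT_ARGS $OUTPUT_ARGS --distributed-backend nccl

With GPUS_PER_NODE=16 and NNODES=32 this launches 512 ranks in total; TP=8 x CP=2 (PP=1) means each model replica spans 16 NPUs, giving a data-parallel size of 32, so the global batch of 64 with micro batch 1 implies 2 gradient-accumulation steps per iteration.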
+[2024-11-27 12:44:15,231] torch.distributed.run: [WARNING] ***************************************** +/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/contrib/transfer_to_npu.py:299: ImportWarning: + ************************************************************************************************************* + The torch.Tensor.cuda and torch.nn.Module.cuda are replaced with torch.Tensor.npu and torch.nn.Module.npu now.. + The torch.cuda.DoubleTensor is replaced with torch.npu.FloatTensor cause the double type is not supported now.. + The backend in torch.distributed.init_process_group set to hccl now.. + The torch.cuda.* and torch.cuda.amp.* are replaced with torch.npu.* and torch.npu.amp.* now.. + The device parameters have been replaced with npu in the function below: + torch.logspace, torch.randint, torch.hann_window, torch.rand, torch.full_like, torch.ones_like, torch.rand_like, torch.randperm, torch.arange, torch.frombuffer, torch.normal, torch._empty_per_channel_affine_quantized, torch.empty_strided, torch.empty_like, torch.scalar_tensor, torch.tril_indices, torch.bartlett_window, torch.ones, torch.sparse_coo_tensor, torch.randn, torch.kaiser_window, torch.tensor, torch.triu_indices, torch.as_tensor, torch.zeros, torch.randint_like, torch.full, torch.eye, torch._sparse_csr_tensor_unsafe, torch.empty, torch._sparse_coo_tensor_unsafe, torch.blackman_window, torch.zeros_like, torch.range, torch.sparse_csr_tensor, torch.randn_like, torch.from_file, torch._cudnn_init_dropout_state, torch._empty_affine_quantized, torch.linspace, torch.hamming_window, torch.empty_quantized, torch._pin_memory, torch.autocast, torch.load, torch.Generator, torch.set_default_device, torch.Tensor.new_empty, torch.Tensor.new_empty_strided, torch.Tensor.new_full, torch.Tensor.new_ones, torch.Tensor.new_tensor, torch.Tensor.new_zeros, torch.Tensor.to, torch.nn.Module.to, torch.nn.Module.to_empty + ************************************************************************************************************* + + warnings.warn(msg, ImportWarning) +/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/contrib/transfer_to_npu.py:260: RuntimeWarning: torch.jit.script and torch.jit.script_method will be disabled by transfer_to_npu, which currently does not support them, if you need to enable them, please do not use transfer_to_npu. + warnings.warn(msg, RuntimeWarning) +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Creating extension directory /root/.cache/torch_extensions/py38_cpu/adaptive_cp... +Emitting ninja build file /root/.cache/torch_extensions/py38_cpu/adaptive_cp/build.ninja... +Building extension module adaptive_cp... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... 
+Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +[1/2] c++ -MMD -MF adaptive_cp.o.d -DTORCH_EXTENSION_NAME=adaptive_cp -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/usr/local/Ascend/ascend-toolkit/latest/include -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/include -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/third_party -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/acl -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/inc -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include/TH -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include/THC -isystem /root/miniconda3/envs/py38/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -fstack-protector-all -Wl,-z,relro,-z,now,-z,noexecstack -fPIC -pie -Wl,--disable-new-dtags,--rpath -s -O2 -c local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/ops/csrc/algorithm/adaptive_cp/adaptive_cp.cpp -o adaptive_cp.o +[2/2] c++ adaptive_cp.o -shared -L/usr/local/Ascend/ascend-toolkit/latest/lib64 -lascendcl -L/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/lib -ltorch_npu -L/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/lib -lc10 -ltorch_cpu -ltorch -ltorch_python -o adaptive_cp.so +Loading extension module adaptive_cp... +local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 + warnings.warn("failed to generate the npu_matmul_add_fp32") +Loading extension module adaptive_cp... +local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 + warnings.warn("failed to generate the npu_matmul_add_fp32") +Loading extension module adaptive_cp... +Loading extension module adaptive_cp... +local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 + warnings.warn("failed to generate the npu_matmul_add_fp32") +Loading extension module adaptive_cp... +local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 + warnings.warn("failed to generate the npu_matmul_add_fp32") +Loading extension module adaptive_cp... +Loading extension module adaptive_cp... 
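The adaptive_cp messages above come from PyTorch's JIT C++ extension build: ninja compiles the MindSpeed source listed in the [1/2] command into adaptive_cp.so under the per-environment extensions root, and the other local ranks then just load the cached module, which is why "Using ... as PyTorch extensions root" and "Loading extension module adaptive_cp..." repeat once per process. A small illustrative bash sketch, assuming the cache path recorded in this log:

# Inspect the JIT-built extension cache created during this run (path taken from the log above);
# per the build lines it should contain build.ninja, adaptive_cp.o and adaptive_cp.so.
ls /root/.cache/torch_extensions/py38_cpu/adaptive_cp/

# Hypothetical maintenance step (not part of the traced script): removing the cache
# directory would force a recompile on the next launch.
# rm -rf /root/.cache/torch_extensions/py38_cpu/adaptive_cp/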
+/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
+ warn( +/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? + warn( +using world size: 512, data-parallel size: 32, context-parallel size: 2 tensor-model-parallel size: 8, pipeline-model-parallel size: 1 +WARNING: Setting args.overlap_p2p_comm to False since non-interleaved schedule does not support overlapping p2p communication +accumulate and all-reduce gradients in fp32 for bfloat16 data type. +using torch.bfloat16 for parameters ... +[INFO] Setting args.use_flash_attn=True since context parallel is enabled. +[INFO] Setting args.create_attention_mask_in_dataloader to False since reset_data=False or alibi_without_flash_attn=False or args.tokenizer_padding_side=right +------------------------ ModelLink Arguments ------------------------ + accumulate_allreduce_grads_in_fp32 .............. True + adam_beta1 ...................................... 0.9 + adam_beta2 ...................................... 0.999 + adam_eps ........................................ 1e-08 + adaptive_cp_dynamic_attn_mask ................... False + adaptive_cp_manually_set_mask_list .............. False + adaptive_cp_only_reschedule ..................... False + adaptive_cp_without_coarse ...................... False + adaptive_recompute_device_size .................. -1 + adaptive_recompute_device_swap .................. False + adaptive_recompute_profiling_step ............... 10 + add_bias_linear ................................. False + add_class_token ................................. True + add_dense_bias .................................. False + add_position_embedding .......................... True + add_qkv_bias .................................... True + add_rmsnorm_offset .............................. False + adlr_autoresume ................................. False + adlr_autoresume_interval ........................ 1000 + apply_layernorm_1p .............................. False + apply_query_key_layer_scaling ................... False + apply_residual_connection_post_layernorm ........ False + apply_rope_fusion ............................... True + async_tensor_model_parallel_allreduce ........... False + attention_dropout ............................... 0.0 + attention_mask_on_cpu ........................... False + attention_softmax_in_fp32 ....................... True + attn_logit_softcapping .......................... None + auto_detect_ckpt_format ......................... False + barrier_with_L1_time ............................ True + bert_binary_head ................................ True + bert_embedder_type .............................. megatron + bert_load ....................................... None + bf16 ............................................ True + bias_dropout_fusion ............................. True + bias_gelu_fusion ................................ False + bias_swiglu_fusion .............................. True + biencoder_projection_dim ........................ 0 + biencoder_shared_query_context_model ............ False + block_data_path ................................. None + check_for_nan_in_loss_and_grad .................. 
True + chunk_size ...................................... 4096 + ckpt_fully_parallel_save ........................ False + ckpt_step ....................................... None + classes_fraction ................................ 1.0 + clip_grad ....................................... 1.0 + clone_scatter_output_in_embedding ............... True + consumed_train_samples .......................... 0 + consumed_valid_samples .......................... 0 + context_parallel_algo ........................... megatron_cp_algo + context_parallel_size ........................... 2 + cp_attention_mask_type .......................... causal + cp_window_size .................................. 1 + create_attention_mask_in_dataloader ............. False + cross_dataset_joint ............................. True + data_cache_path ................................. None + data_parallel_random_init ....................... False + data_parallel_size .............................. 32 + data_path ....................................... ['/local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml'] + data_per_class_fraction ......................... 1.0 + data_seq_length ................................. 131072 + data_sharding ................................... True + dataloader_type ................................. single + decoder_num_layers .............................. None + decoder_seq_length .............................. None + decoupled_lr .................................... None + decoupled_min_lr ................................ None + delay_grad_reduce ............................... True + delay_param_gather .............................. False + dim_model_base .................................. None + dino_bottleneck_size ............................ 256 + dino_freeze_last_layer .......................... 1 + dino_head_hidden_size ........................... 2048 + dino_local_crops_number ......................... 10 + dino_local_img_size ............................. 96 + dino_norm_last_layer ............................ False + dino_teacher_temp ............................... 0.07 + dino_warmup_teacher_temp ........................ 0.04 + dino_warmup_teacher_temp_epochs ................. 30 + dist_ckpt_format ................................ torch_dist + distribute_saved_activations .................... False + distributed_backend ............................. nccl + distributed_timeout_minutes ..................... 120 + dpo_beta ........................................ 0.1 + dpo_ftx ......................................... 0.0 + dpo_label_smoothing ............................. 0.0 + dpo_loss_type ................................... sigmoid + embed_layernorm ................................. False + embedding_multiplier_scale ...................... 1.0 + embedding_path .................................. None + empty_unused_memory_level ....................... 0 + enable_chunk_memory ............................. False + enable_chunk_sequence ........................... False + enable_hbmfault_repair .......................... False + enable_high_availability ........................ False + enable_one_logger ............................... False + enable_optimizer_state_local_copy ............... False + enable_recompute_layers_per_pp_rank ............. False + enable_token_rearrange_opt ...................... False + encoder_num_layers .............................. 48 + encoder_seq_length .............................. 131072 + end_weight_decay ................................ 
0.0 + eod_mask_loss ................................... False + eval_interval ................................... 100 + eval_iters ...................................... 0 + evidence_data_path .............................. None + exit_duration_in_mins ........................... None + exit_interval ................................... None + exit_on_missing_checkpoint ...................... False + exit_signal_handler ............................. False + expert_interval ................................. 1 + expert_model_parallel_size ...................... 1 + ffn_hidden_size ................................. 13824 + fill_neg_inf .................................... False + finetune ........................................ True + first_k_dense_replace ........................... None + first_pipeline_num_layers ....................... 0 + fp16 ............................................ False + fp16_lm_cross_entropy ........................... False + fp32_residual_connection ........................ False + fp8 ............................................. None + fp8_amax_compute_algo ........................... most_recent + fp8_amax_history_len ............................ 1 + fp8_interval .................................... 1 + fp8_margin ...................................... 0 + fp8_wgrad ....................................... True + full_shuffle_instruction_dataset ................ False + geglu ........................................... False + gelu_tanh ....................................... False + global_batch_size ............................... 64 + gradient_accumulation_fusion .................... False + group_query_attention ........................... True + head_lr_mult .................................... 1.0 + hidden_dropout .................................. 0.0 + hidden_size ..................................... 5120 + high_freq_factor ................................ None + hysteresis ...................................... 2 + ict_head_size ................................... None + ict_load ........................................ None + image_size ...................................... 448 + image_token_length .............................. 256 + img_h ........................................... 224 + img_w ........................................... 224 + independent_parallel ............................ False + indexer_batch_size .............................. 128 + indexer_log_interval ............................ 1000 + inference_batch_times_seqlen_threshold .......... 512 + init_method_std ................................. 0.01 + init_method_xavier_uniform ...................... False + initial_loss_scale .............................. 4096.0 + input_embeds_norm ............................... False + input_jitter .................................... True + input_layernorm_in_fp32 ......................... False + interleave_sliding_window ....................... None + is_instruction_dataset .......................... True + is_pairwise_dataset ............................. False + iter_per_epoch .................................. 1250 + jit_compile ..................................... False + kv_channels ..................................... 128 + kv_head_repeat_before_uly_alltoall .............. True + kv_lora_rank .................................... None + language_model_freeze ........................... False + lazy_mpu_init ................................... None + load ............................................ 
/data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/ + load_checkpoint_loosely ......................... False + local_rank ...................................... None + log_batch_size_to_tensorboard ................... False + log_interval .................................... 1 + log_learning_rate_to_tensorboard ................ True + log_loss_scale_to_tensorboard ................... True + log_memory_to_tensorboard ....................... False + log_num_zeros_in_grad ........................... False + log_params_norm ................................. False + log_progress .................................... False + log_throughput .................................. True + log_timers_to_tensorboard ....................... False + log_validation_ppl_to_tensorboard ............... False + log_world_size_to_tensorboard ................... False + logit_mask ...................................... False + lora_alpha ...................................... 32 + lora_fusion ..................................... False + lora_load ....................................... None + lora_modules_to_save ............................ None + lora_r .......................................... 16 + lora_register_forward_hook ...................... ['word_embeddings', 'input_layernorm'] + lora_target_modules ............................. [] + loss_scale ...................................... None + loss_scale_window ............................... 1000 + low_freq_factor ................................. None + lr .............................................. 5e-06 + lr_decay_iters .................................. None + lr_decay_samples ................................ None + lr_decay_style .................................. cosine + lr_warmup_fraction .............................. 0.03 + lr_warmup_init .................................. 0.0 + lr_warmup_iters ................................. 0 + lr_warmup_samples ............................... 0 + make_vocab_size_divisible_by .................... 1 + manual_gc ....................................... False + manual_gc_eval .................................. True + manual_gc_interval .............................. 0 + mask_factor ..................................... 1.0 + mask_prob ....................................... 0.15 + mask_type ....................................... random + masked_softmax_fusion ........................... False + max_fps ......................................... 1 + max_num_frame ................................... 512 + max_num_image ................................... 8 + max_patch_grid .................................. 12 + max_position_embeddings ......................... 131072 + max_tokens_to_oom ............................... 12000 + merge_file ...................................... None + micro_batch_size ................................ 1 + min_loss_scale .................................. 1.0 + min_lr .......................................... 1e-07 + min_patch_grid .................................. 1 + mmap_bin_files .................................. True + mock_data ....................................... False + moe_allgather_overlap_comm ...................... False + moe_alltoall_overlap_comm ....................... False + moe_aux_loss_coeff .............................. 0.0 + moe_comm_aux_loss_coeff ......................... 0.0 + moe_device_level_aux_loss_coeff ................. 0.0 + moe_expert_capacity_factor ...................... 
None + moe_grouped_gemm ................................ False + moe_input_jitter_eps ............................ None + moe_intermediate_size ........................... None + moe_layer_freq .................................. None + moe_pad_expert_input_to_capacity ................ False + moe_per_layer_logging ........................... False + moe_permutation_async_comm ...................... False + moe_router_load_balancing_type .................. aux_loss + moe_router_topk ................................. 2 + moe_token_dispatcher_type ....................... allgather + moe_token_drop_policy ........................... probs + moe_token_dropping .............................. False + moe_tp_extend_ep ................................ False + moe_train_capacity_factor ....................... 1.0 + moe_without_activation .......................... False + moe_z_loss_coeff ................................ 0.0 + moe_zero_memory ................................. disable + multi_head_latent_attention ..................... False + n_shared_experts ................................ None + nccl_communicator_config_path ................... None + next_tockens .................................... 0 + no_load_optim ................................... True + no_load_rng ..................................... True + no_persist_layer_norm ........................... False + no_post_layer_norm .............................. False + no_save_optim ................................... None + no_save_rng ..................................... None + no_shared_storage ............................... False + no_shuffle ...................................... False + noisy_gate_policy ............................... None + noop_layers ..................................... None + norm_epsilon .................................... 1e-06 + norm_topk_prob .................................. False + normalization ................................... RMSNorm + num_attention_heads ............................. 40 + num_channels .................................... 3 + num_classes ..................................... 1000 + num_experts ..................................... None + num_layer_list .................................. None + num_layers ...................................... 48 + num_layers_per_virtual_pipeline_stage ........... None + num_query_groups ................................ 8 + num_workers ..................................... 8 + one_logger_entity ............................... hwinf_dcm + one_logger_project .............................. e2e-tracking + one_logger_run_name ............................. None + onnx_safe ....................................... None + openai_gelu ..................................... False + optimizer ....................................... adam + original_max_position_embeddings ................ None + output_bert_embeddings .......................... False + output_layer_slice_num .......................... 1 + output_logit_softcapping ........................ None + output_multiplier_scale ......................... None + overlap_grad_reduce ............................. True + overlap_p2p_comm ................................ False + overlap_param_gather ............................ False + override_opt_param_scheduler .................... False + pad_to_multiple_of .............................. 8 + padded_vocab_size ............................... 152064 + params_dtype .................................... 
torch.bfloat16 + patch_dim ....................................... 16 + perform_initialization .......................... True + pipeline_model_parallel_size .................... 1 + pipeline_model_parallel_split_rank .............. None + position_embedding_type ......................... rope + post_norm ....................................... False + pre_tockens ..................................... 65536 + pref_ftx ........................................ 0.0 + pretrained_checkpoint ........................... None + profile ......................................... False + profile_level ................................... level0 + profile_ranks ................................... [-1] + profile_record_shapes ........................... False + profile_save_path ............................... ./profile_dir + profile_step_end ................................ 12 + profile_step_start .............................. 10 + profile_with_cpu ................................ False + profile_with_memory ............................. False + profile_with_stack .............................. False + prompt_format ................................... qwen2 + prompt_type ..................................... None + q_lora_rank ..................................... None + qk_layernorm .................................... False + qk_nope_head_dim ................................ None + qk_rope_head_dim ................................ None + query_in_block_prob ............................. 0.1 + query_pre_attn_scalar ........................... None + rampup_batch_size ............................... None + rank ............................................ 0 + recompute_activation_function ................... False + recompute_activation_function_num_layers ........ None + recompute_granularity ........................... full + recompute_in_advance ............................ False + recompute_in_bubble ............................. False + recompute_method ................................ block + recompute_num_layers ............................ 48 + reduce_recompute_for_last_chunk ................. False + ref_model ....................................... None + reset_attention_mask ............................ False + reset_position_ids .............................. False + retriever_report_topk_accuracies ................ [] + retriever_score_scaling ......................... False + retriever_seq_length ............................ 256 + retro_add_retriever ............................. False + retro_attention_gate ............................ 1 + retro_cyclic_train_iters ........................ None + retro_encoder_attention_dropout ................. 0.1 + retro_encoder_hidden_dropout .................... 0.1 + retro_encoder_layers ............................ 2 + retro_num_neighbors ............................. 2 + retro_num_retrieved_chunks ...................... 2 + retro_project_dir ............................... None + retro_verify_neighbor_count ..................... True + reuse_fp32_param ................................ False + rope_scaling_beta_fast .......................... 32 + rope_scaling_beta_slow .......................... 1 + rope_scaling_factor ............................. 1.0 + rope_scaling_mscale ............................. 1.0 + rope_scaling_mscale_all_dim ..................... 0.0 + rope_scaling_original_max_position_embeddings ... None + rope_scaling_type ............................... None + rotary_base ..................................... 
1000000.0 + rotary_interleaved .............................. False + rotary_percent .................................. 1.0 + rotary_seq_len_interpolation_factor ............. None + routed_scaling_factor ........................... None + sample_rate ..................................... 1.0 + save ............................................ /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213// + save_interval ................................... 100 + scale_depth ..................................... None + scale_emb ....................................... None + scatter_gather_tensors_in_pipeline .............. True + seed ............................................ 424242 + seq_aux ......................................... False + seq_length ...................................... 131072 + sequence_parallel ............................... True + sgd_momentum .................................... 0.9 + shape_order ..................................... SBH + shared_expert_gate .............................. False + shared_expert_gate_output_dimension ............. 1 + short_seq_prob .................................. 0.1 + skip_bias_add ................................... True + skip_train ...................................... False + sliding_window .................................. None + sparse_mode ..................................... 4 + spec ............................................ None + split ........................................... 100,0,0 + square_alibi_mask ............................... False + squared_relu .................................... False + stage ........................................... None + standalone_embedding_stage ...................... False + start_weight_decay .............................. 0.0 + swap_attention .................................. False + swap_modules .................................... None + swiglu .......................................... True + swin_backbone_type .............................. tiny + tensor_model_parallel_size ...................... 8 + tensorboard_dir ................................. None + tensorboard_log_interval ........................ 1 + tensorboard_queue_size .......................... 1000 + test_data_path .................................. None + test_mode ....................................... False + timing_log_level ................................ 0 + timing_log_option ............................... minmax + titles_data_path ................................ None + tokenizer_kwargs ................................ None + tokenizer_model ................................. None + tokenizer_name_or_path .......................... /data_4/models/Qwen/Qwen2.5-14B-Instruct/ + tokenizer_not_use_fast .......................... True + tokenizer_padding_side .......................... right + tokenizer_type .................................. PretrainedFromHF + topk_group ...................................... None + tp_2d ........................................... False + tp_comm_bulk_dgrad .............................. True + tp_comm_bulk_wgrad .............................. True + tp_comm_overlap ................................. False + tp_comm_overlap_ag .............................. True + tp_comm_overlap_cfg ............................. None + tp_comm_overlap_rs .............................. True + tp_comm_split_ag ................................ True + tp_comm_split_rs ................................ 
True + tp_x ............................................ 1 + tp_y ............................................ 1 + train_data_path ................................. None + train_iters ..................................... 1000 + train_samples ................................... None + transformer_impl ................................ local + transformer_pipeline_model_parallel_size ........ 1 + ulysses_degree_in_cp ............................ None + untie_embeddings_and_output_weights ............. True + use_checkpoint_args ............................. False + use_checkpoint_opt_param_scheduler .............. False + use_cp_send_recv_overlap ........................ True + use_cpu_initialization .......................... None + use_deter_comp .................................. False + use_dist_ckpt ................................... False + use_distributed_optimizer ....................... True + use_flash_attn .................................. True + use_fused_moe_token_permute_and_unpermute ....... False + use_fused_ring_attention_update ................. False + use_fused_rmsnorm ............................... True + use_fused_rotary_pos_emb ........................ True + use_fused_swiglu ................................ True + use_glm_rope .................................... False + use_mc2 ......................................... False + use_mcore_models ................................ True + use_one_sent_docs ............................... False + use_ring_exchange_p2p ........................... False + use_rotary_position_embeddings .................. True + v_head_dim ...................................... None + valid_data_path ................................. None + variable_seq_lengths ............................ False + virtual_pipeline_model_parallel_size ............ None + vision_backbone_type ............................ vit + vision_context_parallel ......................... False + vision_downsample_ratio ......................... 0.5 + vision_downsample_stride ........................ 1.0 + vision_model_freeze ............................. True + vision_model_lr_decay_rate ...................... 1.0 + vision_model_lr_mult ............................ 1.0 + vision_model_recompute .......................... False + vision_model_type ............................... intern_300m + vision_normalize_type ........................... imagenet + vision_pretraining .............................. False + vision_pretraining_type ......................... classify + vision_process_type ............................. dynamic + vision_projector_freeze ......................... False + vision_projector_pre_norm ....................... True + vision_projector_recompute ...................... False + vision_projector_type ........................... mlp + vision_seq_length ............................... 1025 + vit_load ........................................ / + vocab_extra_ids ................................. 0 + vocab_file ...................................... None + vocab_size ...................................... None + wandb_exp_name .................................. + wandb_project ................................... + wandb_save_dir .................................. + weight_decay .................................... 0.0 + weight_decay_incr_style ......................... constant + world_size ...................................... 512 + yaml_cfg ........................................ 
None +-------------------- end of ModelLink Arguments --------------------- +setting number of micro-batches to constant 2 + > building PretrainFromHF tokenizer. Vocab file is un-used, loading tokenizer from pre-trained model +> initializing torch distributed ... +[W ProcessGroupHCCL.cpp:678] Warning: The HCCL execution timeout 7200000ms is bigger than watchdog timeout 1800000ms which is set by init_process_group! The plog may not be recorded. (function ProcessGroupHCCL) +[W ProcessGroupHCCL.cpp:678] Warning: The HCCL execution timeout 7200000ms is bigger than watchdog timeout 1800000ms which is set by init_process_group! The plog may not be recorded. (function ProcessGroupHCCL) +[W ProcessGroupHCCL.cpp:678] Warning: The HCCL execution timeout 7200000ms is bigger than watchdog timeout 1800000ms which is set by init_process_group! The plog may not be recorded. (function ProcessGroupHCCL) +[W ProcessGroupHCCL.cpp:678] Warning: The HCCL execution timeout 7200000ms is bigger than watchdog timeout 1800000ms which is set by init_process_group! The plog may not be recorded. (function ProcessGroupHCCL) +[W ProcessGroupHCCL.cpp:678] Warning: The HCCL execution timeout 7200000ms is bigger than watchdog timeout 1800000ms which is set by init_process_group! The plog may not be recorded. (function ProcessGroupHCCL) +all tp groups [[0, 1, 2, 3, 4, 5, 6, 7], [8, 9, 10, 11, 12, 13, 14, 15], [16, 17, 18, 19, 20, 21, 22, 23], [24, 25, 26, 27, 28, 29, 30, 31], [32, 33, 34, 35, 36, 37, 38, 39], [40, 41, 42, 43, 44, 45, 46, 47], [48, 49, 50, 51, 52, 53, 54, 55], [56, 57, 58, 59, 60, 61, 62, 63], [64, 65, 66, 67, 68, 69, 70, 71], [72, 73, 74, 75, 76, 77, 78, 79], [80, 81, 82, 83, 84, 85, 86, 87], [88, 89, 90, 91, 92, 93, 94, 95], [96, 97, 98, 99, 100, 101, 102, 103], [104, 105, 106, 107, 108, 109, 110, 111], [112, 113, 114, 115, 116, 117, 118, 119], [120, 121, 122, 123, 124, 125, 126, 127], [128, 129, 130, 131, 132, 133, 134, 135], [136, 137, 138, 139, 140, 141, 142, 143], [144, 145, 146, 147, 148, 149, 150, 151], [152, 153, 154, 155, 156, 157, 158, 159], [160, 161, 162, 163, 164, 165, 166, 167], [168, 169, 170, 171, 172, 173, 174, 175], [176, 177, 178, 179, 180, 181, 182, 183], [184, 185, 186, 187, 188, 189, 190, 191], [192, 193, 194, 195, 196, 197, 198, 199], [200, 201, 202, 203, 204, 205, 206, 207], [208, 209, 210, 211, 212, 213, 214, 215], [216, 217, 218, 219, 220, 221, 222, 223], [224, 225, 226, 227, 228, 229, 230, 231], [232, 233, 234, 235, 236, 237, 238, 239], [240, 241, 242, 243, 244, 245, 246, 247], [248, 249, 250, 251, 252, 253, 254, 255], [256, 257, 258, 259, 260, 261, 262, 263], [264, 265, 266, 267, 268, 269, 270, 271], [272, 273, 274, 275, 276, 277, 278, 279], [280, 281, 282, 283, 284, 285, 286, 287], [288, 289, 290, 291, 292, 293, 294, 295], [296, 297, 298, 299, 300, 301, 302, 303], [304, 305, 306, 307, 308, 309, 310, 311], [312, 313, 314, 315, 316, 317, 318, 319], [320, 321, 322, 323, 324, 325, 326, 327], [328, 329, 330, 331, 332, 333, 334, 335], [336, 337, 338, 339, 340, 341, 342, 343], [344, 345, 346, 347, 348, 349, 350, 351], [352, 353, 354, 355, 356, 357, 358, 359], [360, 361, 362, 363, 364, 365, 366, 367], [368, 369, 370, 371, 372, 373, 374, 375], [376, 377, 378, 379, 380, 381, 382, 383], [384, 385, 386, 387, 388, 389, 390, 391], [392, 393, 394, 395, 396, 397, 398, 399], [400, 401, 402, 403, 404, 405, 406, 407], [408, 409, 410, 411, 412, 413, 414, 415], [416, 417, 418, 419, 420, 421, 422, 423], [424, 425, 426, 427, 428, 429, 430, 431], [432, 433, 434, 435, 436, 437, 438, 439], [440, 441, 
442, 443, 444, 445, 446, 447], [448, 449, 450, 451, 452, 453, 454, 455], [456, 457, 458, 459, 460, 461, 462, 463], [464, 465, 466, 467, 468, 469, 470, 471], [472, 473, 474, 475, 476, 477, 478, 479], [480, 481, 482, 483, 484, 485, 486, 487], [488, 489, 490, 491, 492, 493, 494, 495], [496, 497, 498, 499, 500, 501, 502, 503], [504, 505, 506, 507, 508, 509, 510, 511]] +all ep groups [[0], [8], [16], [24], [32], [40], [48], [56], [64], [72], [80], [88], [96], [104], [112], [120], [128], [136], [144], [152], [160], [168], [176], [184], [192], [200], [208], [216], [224], [232], [240], [248], [256], [264], [272], [280], [288], [296], [304], [312], [320], [328], [336], [344], [352], [360], [368], [376], [384], [392], [400], [408], [416], [424], [432], [440], [448], [456], [464], [472], [480], [488], [496], [504], [1], [9], [17], [25], [33], [41], [49], [57], [65], [73], [81], [89], [97], [105], [113], [121], [129], [137], [145], [153], [161], [169], [177], [185], [193], [201], [209], [217], [225], [233], [241], [249], [257], [265], [273], [281], [289], [297], [305], [313], [321], [329], [337], [345], [353], [361], [369], [377], [385], [393], [401], [409], [417], [425], [433], [441], [449], [457], [465], [473], [481], [489], [497], [505], [2], [10], [18], [26], [34], [42], [50], [58], [66], [74], [82], [90], [98], [106], [114], [122], [130], [138], [146], [154], [162], [170], [178], [186], [194], [202], [210], [218], [226], [234], [242], [250], [258], [266], [274], [282], [290], [298], [306], [314], [322], [330], [338], [346], [354], [362], [370], [378], [386], [394], [402], [410], [418], [426], [434], [442], [450], [458], [466], [474], [482], [490], [498], [506], [3], [11], [19], [27], [35], [43], [51], [59], [67], [75], [83], [91], [99], [107], [115], [123], [131], [139], [147], [155], [163], [171], [179], [187], [195], [203], [211], [219], [227], [235], [243], [251], [259], [267], [275], [283], [291], [299], [307], [315], [323], [331], [339], [347], [355], [363], [371], [379], [387], [395], [403], [411], [419], [427], [435], [443], [451], [459], [467], [475], [483], [491], [499], [507], [4], [12], [20], [28], [36], [44], [52], [60], [68], [76], [84], [92], [100], [108], [116], [124], [132], [140], [148], [156], [164], [172], [180], [188], [196], [204], [212], [220], [228], [236], [244], [252], [260], [268], [276], [284], [292], [300], [308], [316], [324], [332], [340], [348], [356], [364], [372], [380], [388], [396], [404], [412], [420], [428], [436], [444], [452], [460], [468], [476], [484], [492], [500], [508], [5], [13], [21], [29], [37], [45], [53], [61], [69], [77], [85], [93], [101], [109], [117], [125], [133], [141], [149], [157], [165], [173], [181], [189], [197], [205], [213], [221], [229], [237], [245], [253], [261], [269], [277], [285], [293], [301], [309], [317], [325], [333], [341], [349], [357], [365], [373], [381], [389], [397], [405], [413], [421], [429], [437], [445], [453], [461], [469], [477], [485], [493], [501], [509], [6], [14], [22], [30], [38], [46], [54], [62], [70], [78], [86], [94], [102], [110], [118], [126], [134], [142], [150], [158], [166], [174], [182], [190], [198], [206], [214], [222], [230], [238], [246], [254], [262], [270], [278], [286], [294], [302], [310], [318], [326], [334], [342], [350], [358], [366], [374], [382], [390], [398], [406], [414], [422], [430], [438], [446], [454], [462], [470], [478], [486], [494], [502], [510], [7], [15], [23], [31], [39], [47], [55], [63], [71], [79], [87], [95], [103], [111], [119], [127], [135], [143], [151], [159], 
[167], [175], [183], [191], [199], [207], [215], [223], [231], [239], [247], [255], [263], [271], [279], [287], [295], [303], [311], [319], [327], [335], [343], [351], [359], [367], [375], [383], [391], [399], [407], [415], [423], [431], [439], [447], [455], [463], [471], [479], [487], [495], [503], [511]] +all dp groups [[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496], [1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 401, 417, 433, 449, 465, 481, 497], [2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 402, 418, 434, 450, 466, 482, 498], [3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371, 387, 403, 419, 435, 451, 467, 483, 499], [4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340, 356, 372, 388, 404, 420, 436, 452, 468, 484, 500], [5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309, 325, 341, 357, 373, 389, 405, 421, 437, 453, 469, 485, 501], [6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278, 294, 310, 326, 342, 358, 374, 390, 406, 422, 438, 454, 470, 486, 502], [7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, 263, 279, 295, 311, 327, 343, 359, 375, 391, 407, 423, 439, 455, 471, 487, 503], [8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248, 264, 280, 296, 312, 328, 344, 360, 376, 392, 408, 424, 440, 456, 472, 488, 504], [9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, 377, 393, 409, 425, 441, 457, 473, 489, 505], [10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, 394, 410, 426, 442, 458, 474, 490, 506], [11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363, 379, 395, 411, 427, 443, 459, 475, 491, 507], [12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284, 300, 316, 332, 348, 364, 380, 396, 412, 428, 444, 460, 476, 492, 508], [13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269, 285, 301, 317, 333, 349, 365, 381, 397, 413, 429, 445, 461, 477, 493, 509], [14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 270, 286, 302, 318, 334, 350, 366, 382, 398, 414, 430, 446, 462, 478, 494, 510], [15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 415, 431, 447, 463, 479, 495, 511]] +all_dp_modulo_exp_group_ranks [[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 392, 400, 408, 416, 424, 432, 440, 448, 456, 464, 472, 480, 488, 496, 504], [1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249, 257, 265, 273, 281, 289, 297, 305, 313, 321, 329, 337, 345, 353, 361, 369, 377, 385, 393, 401, 409, 417, 425, 433, 441, 449, 457, 465, 473, 481, 489, 497, 505], [2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 
186, 194, 202, 210, 218, 226, 234, 242, 250, 258, 266, 274, 282, 290, 298, 306, 314, 322, 330, 338, 346, 354, 362, 370, 378, 386, 394, 402, 410, 418, 426, 434, 442, 450, 458, 466, 474, 482, 490, 498, 506], [3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187, 195, 203, 211, 219, 227, 235, 243, 251, 259, 267, 275, 283, 291, 299, 307, 315, 323, 331, 339, 347, 355, 363, 371, 379, 387, 395, 403, 411, 419, 427, 435, 443, 451, 459, 467, 475, 483, 491, 499, 507], [4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172, 180, 188, 196, 204, 212, 220, 228, 236, 244, 252, 260, 268, 276, 284, 292, 300, 308, 316, 324, 332, 340, 348, 356, 364, 372, 380, 388, 396, 404, 412, 420, 428, 436, 444, 452, 460, 468, 476, 484, 492, 500, 508], [5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157, 165, 173, 181, 189, 197, 205, 213, 221, 229, 237, 245, 253, 261, 269, 277, 285, 293, 301, 309, 317, 325, 333, 341, 349, 357, 365, 373, 381, 389, 397, 405, 413, 421, 429, 437, 445, 453, 461, 469, 477, 485, 493, 501, 509], [6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142, 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 238, 246, 254, 262, 270, 278, 286, 294, 302, 310, 318, 326, 334, 342, 350, 358, 366, 374, 382, 390, 398, 406, 414, 422, 430, 438, 446, 454, 462, 470, 478, 486, 494, 502, 510], [7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247, 255, 263, 271, 279, 287, 295, 303, 311, 319, 327, 335, 343, 351, 359, 367, 375, 383, 391, 399, 407, 415, 423, 431, 439, 447, 455, 463, 471, 479, 487, 495, 503, 511]] +all_tensor_and_expert_group_ranks [[0, 1, 2, 3, 4, 5, 6, 7], [8, 9, 10, 11, 12, 13, 14, 15], [16, 17, 18, 19, 20, 21, 22, 23], [24, 25, 26, 27, 28, 29, 30, 31], [32, 33, 34, 35, 36, 37, 38, 39], [40, 41, 42, 43, 44, 45, 46, 47], [48, 49, 50, 51, 52, 53, 54, 55], [56, 57, 58, 59, 60, 61, 62, 63], [64, 65, 66, 67, 68, 69, 70, 71], [72, 73, 74, 75, 76, 77, 78, 79], [80, 81, 82, 83, 84, 85, 86, 87], [88, 89, 90, 91, 92, 93, 94, 95], [96, 97, 98, 99, 100, 101, 102, 103], [104, 105, 106, 107, 108, 109, 110, 111], [112, 113, 114, 115, 116, 117, 118, 119], [120, 121, 122, 123, 124, 125, 126, 127], [128, 129, 130, 131, 132, 133, 134, 135], [136, 137, 138, 139, 140, 141, 142, 143], [144, 145, 146, 147, 148, 149, 150, 151], [152, 153, 154, 155, 156, 157, 158, 159], [160, 161, 162, 163, 164, 165, 166, 167], [168, 169, 170, 171, 172, 173, 174, 175], [176, 177, 178, 179, 180, 181, 182, 183], [184, 185, 186, 187, 188, 189, 190, 191], [192, 193, 194, 195, 196, 197, 198, 199], [200, 201, 202, 203, 204, 205, 206, 207], [208, 209, 210, 211, 212, 213, 214, 215], [216, 217, 218, 219, 220, 221, 222, 223], [224, 225, 226, 227, 228, 229, 230, 231], [232, 233, 234, 235, 236, 237, 238, 239], [240, 241, 242, 243, 244, 245, 246, 247], [248, 249, 250, 251, 252, 253, 254, 255], [256, 257, 258, 259, 260, 261, 262, 263], [264, 265, 266, 267, 268, 269, 270, 271], [272, 273, 274, 275, 276, 277, 278, 279], [280, 281, 282, 283, 284, 285, 286, 287], [288, 289, 290, 291, 292, 293, 294, 295], [296, 297, 298, 299, 300, 301, 302, 303], [304, 305, 306, 307, 308, 309, 310, 311], [312, 313, 314, 315, 316, 317, 318, 319], [320, 321, 322, 323, 324, 325, 326, 327], [328, 329, 330, 331, 332, 333, 334, 335], [336, 337, 338, 339, 340, 341, 342, 343], [344, 345, 346, 347, 348, 349, 350, 351], [352, 353, 354, 355, 356, 357, 
358, 359], [360, 361, 362, 363, 364, 365, 366, 367], [368, 369, 370, 371, 372, 373, 374, 375], [376, 377, 378, 379, 380, 381, 382, 383], [384, 385, 386, 387, 388, 389, 390, 391], [392, 393, 394, 395, 396, 397, 398, 399], [400, 401, 402, 403, 404, 405, 406, 407], [408, 409, 410, 411, 412, 413, 414, 415], [416, 417, 418, 419, 420, 421, 422, 423], [424, 425, 426, 427, 428, 429, 430, 431], [432, 433, 434, 435, 436, 437, 438, 439], [440, 441, 442, 443, 444, 445, 446, 447], [448, 449, 450, 451, 452, 453, 454, 455], [456, 457, 458, 459, 460, 461, 462, 463], [464, 465, 466, 467, 468, 469, 470, 471], [472, 473, 474, 475, 476, 477, 478, 479], [480, 481, 482, 483, 484, 485, 486, 487], [488, 489, 490, 491, 492, 493, 494, 495], [496, 497, 498, 499, 500, 501, 502, 503], [504, 505, 506, 507, 508, 509, 510, 511]] +all_data_parallel_group_ranks_with_cp [[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 392, 400, 408, 416, 424, 432, 440, 448, 456, 464, 472, 480, 488, 496, 504], [1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249, 257, 265, 273, 281, 289, 297, 305, 313, 321, 329, 337, 345, 353, 361, 369, 377, 385, 393, 401, 409, 417, 425, 433, 441, 449, 457, 465, 473, 481, 489, 497, 505], [2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202, 210, 218, 226, 234, 242, 250, 258, 266, 274, 282, 290, 298, 306, 314, 322, 330, 338, 346, 354, 362, 370, 378, 386, 394, 402, 410, 418, 426, 434, 442, 450, 458, 466, 474, 482, 490, 498, 506], [3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187, 195, 203, 211, 219, 227, 235, 243, 251, 259, 267, 275, 283, 291, 299, 307, 315, 323, 331, 339, 347, 355, 363, 371, 379, 387, 395, 403, 411, 419, 427, 435, 443, 451, 459, 467, 475, 483, 491, 499, 507], [4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172, 180, 188, 196, 204, 212, 220, 228, 236, 244, 252, 260, 268, 276, 284, 292, 300, 308, 316, 324, 332, 340, 348, 356, 364, 372, 380, 388, 396, 404, 412, 420, 428, 436, 444, 452, 460, 468, 476, 484, 492, 500, 508], [5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157, 165, 173, 181, 189, 197, 205, 213, 221, 229, 237, 245, 253, 261, 269, 277, 285, 293, 301, 309, 317, 325, 333, 341, 349, 357, 365, 373, 381, 389, 397, 405, 413, 421, 429, 437, 445, 453, 461, 469, 477, 485, 493, 501, 509], [6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142, 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 238, 246, 254, 262, 270, 278, 286, 294, 302, 310, 318, 326, 334, 342, 350, 358, 366, 374, 382, 390, 398, 406, 414, 422, 430, 438, 446, 454, 462, 470, 478, 486, 494, 502, 510], [7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247, 255, 263, 271, 279, 287, 295, 303, 311, 319, 327, 335, 343, 351, 359, 367, 375, 383, 391, 399, 407, 415, 423, 431, 439, 447, 455, 463, 471, 479, 487, 495, 503, 511]] +> initialized tensor model parallel with size 8 +> initialized pipeline model parallel with size 1 +> setting random seeds to 424242 ... +> compiling dataset index builder ... 
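The group listings above follow the usual Megatron-style decomposition of the 512 ranks into tensor-, context-, data- and pipeline-parallel dimensions. A small arithmetic sketch (not the library code itself) that reproduces the sizes and the first groups printed by this run, including the "constant 2" micro-batch count:

    # world size 512 = tp 8 x cp 2 x pp 1 x dp 32, as reported in the argument dump
    world_size, tp, cp, pp = 512, 8, 2, 1
    dp = world_size // (tp * cp * pp)                        # 32
    global_batch, micro_batch = 64, 1
    num_micro_batches = global_batch // (micro_batch * dp)   # 2 -> "constant 2" above

    # Tensor-parallel groups: tp consecutive ranks per group.
    tp_groups = [list(range(i, i + tp)) for i in range(0, world_size, tp)]
    assert tp_groups[0] == [0, 1, 2, 3, 4, 5, 6, 7]

    # Data-parallel groups including context parallelism: same TP index, stride tp.
    dp_cp_groups = [list(range(r, world_size, tp)) for r in range(tp)]
    assert dp_cp_groups[0][:3] == [0, 8, 16] and len(dp_cp_groups[0]) == dp * cp

    # Pure data-parallel groups: stride tp * cp.
    dp_groups = [list(range(r, world_size, tp * cp)) for r in range(tp * cp)]
    assert dp_groups[0][:3] == [0, 16, 32] and len(dp_groups[0]) == dp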
+make: Entering directory 'local_disk/cognitron_vl/third_party/Megatron-LM_core_r0.6.0/megatron/core/datasets' +make: Nothing to be done for 'default'. +make: Leaving directory 'local_disk/cognitron_vl/third_party/Megatron-LM_core_r0.6.0/megatron/core/datasets' +>>> done with dataset index builder. Compilation time: 0.074 seconds +time to initialize megatron (seconds): -34.653 +[after megatron is initialized] datetime: 2024-11-27 12:45:17 +building GPT model ... +Building megatron mcore vision language model ... +vision_projector_recompute False +vision_projector_recompute False +vision_projector_recompute False +vision_projector_recompute False +vision_projector_recompute Falsevision_projector_recompute Falsevision_projector_recompute False + + +vision_projector_recompute False +vision_projector_recompute False +vision_projector_recompute False +vision_projector_recompute False +vision_projector_recompute False +vision_projector_recompute False +model_provider args Namespace(accumulate_allreduce_grads_in_fp32=True, adam_beta1=0.9, adam_beta2=0.999, adam_eps=1e-08, adaptive_cp_dynamic_attn_mask=False, adaptive_cp_manually_set_mask_list=False, adaptive_cp_only_reschedule=False, adaptive_cp_without_coarse=False, adaptive_recompute_device_size=-1, adaptive_recompute_device_swap=False, adaptive_recompute_profiling_step=10, add_bias_linear=False, add_class_token=True, add_dense_bias=False, add_position_embedding=True, add_qkv_bias=True, add_rmsnorm_offset=False, adlr_autoresume=False, adlr_autoresume_interval=1000, apply_layernorm_1p=False, apply_query_key_layer_scaling=False, apply_residual_connection_post_layernorm=False, apply_rope_fusion=True, async_tensor_model_parallel_allreduce=False, attention_dropout=0.0, attention_mask_on_cpu=False, attention_softmax_in_fp32=True, attn_logit_softcapping=None, auto_detect_ckpt_format=False, barrier_with_L1_time=True, bert_binary_head=True, bert_embedder_type='megatron', bert_load=None, bf16=True, bias_dropout_fusion=True, bias_gelu_fusion=False, bias_swiglu_fusion=True, biencoder_projection_dim=0, biencoder_shared_query_context_model=False, block_data_path=None, check_for_nan_in_loss_and_grad=True, chunk_size=4096, ckpt_fully_parallel_save=False, ckpt_step=None, classes_fraction=1.0, clip_grad=1.0, clone_scatter_output_in_embedding=True, consumed_train_samples=0, consumed_valid_samples=0, context_parallel_algo='megatron_cp_algo', context_parallel_size=2, cp_attention_mask_type='causal', cp_window_size=1, create_attention_mask_in_dataloader=False, cross_dataset_joint=True, data_cache_path=None, data_parallel_random_init=False, data_parallel_size=32, data_path=['/local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml'], data_per_class_fraction=1.0, data_seq_length=131072, data_sharding=True, dataloader_type='single', decoder_num_layers=None, decoder_seq_length=None, decoupled_lr=None, decoupled_min_lr=None, delay_grad_reduce=True, delay_param_gather=False, dim_model_base=None, dino_bottleneck_size=256, dino_freeze_last_layer=1, dino_head_hidden_size=2048, dino_local_crops_number=10, dino_local_img_size=96, dino_norm_last_layer=False, dino_teacher_temp=0.07, dino_warmup_teacher_temp=0.04, dino_warmup_teacher_temp_epochs=30, dist_ckpt_format='torch_dist', distribute_saved_activations=False, distributed_backend='nccl', distributed_timeout_minutes=120, dpo_beta=0.1, dpo_ftx=0.0, dpo_label_smoothing=0.0, dpo_loss_type='sigmoid', embed_layernorm=False, embedding_multiplier_scale=1.0, embedding_path=None, empty_unused_memory_level=0, 
enable_chunk_memory=False, enable_chunk_sequence=False, enable_hbmfault_repair=False, enable_high_availability=False, enable_one_logger=False, enable_optimizer_state_local_copy=False, enable_recompute_layers_per_pp_rank=False, enable_token_rearrange_opt=False, encoder_num_layers=48, encoder_seq_length=131072, end_weight_decay=0.0, eod_mask_loss=False, eval_interval=100, eval_iters=0, evidence_data_path=None, exit_duration_in_mins=None, exit_interval=None, exit_on_missing_checkpoint=False, exit_signal_handler=False, expert_interval=1, expert_model_parallel_size=1, ffn_hidden_size=13824, fill_neg_inf=False, finetune=True, first_k_dense_replace=None, first_pipeline_num_layers=0, fp16=False, fp16_lm_cross_entropy=False, fp32_residual_connection=False, fp8=None, fp8_amax_compute_algo='most_recent', fp8_amax_history_len=1, fp8_interval=1, fp8_margin=0, fp8_wgrad=True, full_shuffle_instruction_dataset=False, geglu=False, gelu_tanh=False, global_batch_size=64, gradient_accumulation_fusion=False, group_query_attention=True, head_lr_mult=1.0, hidden_dropout=0.0, hidden_size=5120, high_freq_factor=None, hysteresis=2, ict_head_size=None, ict_load=None, image_size=448, image_token_length=256, img_h=224, img_w=224, independent_parallel=False, indexer_batch_size=128, indexer_log_interval=1000, inference_batch_times_seqlen_threshold=512, init_method_std=0.01, init_method_xavier_uniform=False, initial_loss_scale=4096.0, input_embeds_norm=False, input_jitter=True, input_layernorm_in_fp32=False, interleave_sliding_window=None, is_instruction_dataset=True, is_pairwise_dataset=False, iter_per_epoch=1250, jit_compile=False, kv_channels=128, kv_head_repeat_before_uly_alltoall=True, kv_lora_rank=None, language_model_freeze=False, lazy_mpu_init=None, load='/data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/', load_checkpoint_loosely=False, local_rank=0, log_batch_size_to_tensorboard=False, log_interval=1, log_learning_rate_to_tensorboard=True, log_loss_scale_to_tensorboard=True, log_memory_to_tensorboard=False, log_num_zeros_in_grad=False, log_params_norm=False, log_progress=False, log_throughput=True, log_timers_to_tensorboard=False, log_validation_ppl_to_tensorboard=False, log_world_size_to_tensorboard=False, logit_mask=False, lora_alpha=32, lora_fusion=False, lora_load=None, lora_modules_to_save=None, lora_r=16, lora_register_forward_hook=['word_embeddings', 'input_layernorm'], lora_target_modules=[], loss_scale=None, loss_scale_window=1000, low_freq_factor=None, lr=5e-06, lr_decay_iters=None, lr_decay_samples=None, lr_decay_style='cosine', lr_warmup_fraction=0.03, lr_warmup_init=0.0, lr_warmup_iters=0, lr_warmup_samples=0, make_vocab_size_divisible_by=1, manual_gc=False, manual_gc_eval=True, manual_gc_interval=0, mask_factor=1.0, mask_prob=0.15, mask_type='random', masked_softmax_fusion=False, max_fps=1, max_num_frame=512, max_num_image=8, max_patch_grid=12, max_position_embeddings=131072, max_tokens_to_oom=12000, merge_file=None, micro_batch_size=1, min_loss_scale=1.0, min_lr=1e-07, min_patch_grid=1, mmap_bin_files=True, mock_data=False, model_type=, moe_allgather_overlap_comm=False, moe_alltoall_overlap_comm=False, moe_aux_loss_coeff=0.0, moe_comm_aux_loss_coeff=0.0, moe_device_level_aux_loss_coeff=0.0, moe_expert_capacity_factor=None, moe_grouped_gemm=False, moe_input_jitter_eps=None, moe_intermediate_size=None, moe_layer_freq=None, moe_pad_expert_input_to_capacity=False, moe_per_layer_logging=False, moe_permutation_async_comm=False, 
moe_router_load_balancing_type='aux_loss', moe_router_topk=2, moe_token_dispatcher_type='allgather', moe_token_drop_policy='probs', moe_token_dropping=False, moe_tp_extend_ep=False, moe_train_capacity_factor=1.0, moe_without_activation=False, moe_z_loss_coeff=0.0, moe_zero_memory='disable', multi_head_latent_attention=False, n_shared_experts=None, nccl_communicator_config_path=None, next_tockens=0, no_load_optim=True, no_load_rng=True, no_persist_layer_norm=False, no_post_layer_norm=False, no_save_optim=None, no_save_rng=None, no_shared_storage=False, no_shuffle=False, noisy_gate_policy=None, noop_layers=None, norm_epsilon=1e-06, norm_topk_prob=False, normalization='RMSNorm', num_attention_heads=40, num_channels=3, num_classes=1000, num_experts=None, num_layer_list=None, num_layers=48, num_layers_per_virtual_pipeline_stage=None, num_query_groups=8, num_workers=8, one_logger_entity='hwinf_dcm', one_logger_project='e2e-tracking', one_logger_run_name=None, onnx_safe=None, openai_gelu=False, optimizer='adam', original_max_position_embeddings=None, output_bert_embeddings=False, output_layer_slice_num=1, output_logit_softcapping=None, output_multiplier_scale=None, overlap_grad_reduce=True, overlap_p2p_comm=False, overlap_param_gather=False, override_opt_param_scheduler=False, pad_to_multiple_of=8, padded_vocab_size=152064, params_dtype=torch.bfloat16, patch_dim=16, perform_initialization=True, pipeline_model_parallel_size=1, pipeline_model_parallel_split_rank=None, position_embedding_type='rope', post_norm=False, pre_tockens=65536, pref_ftx=0.0, pretrained_checkpoint=None, profile=False, profile_level='level0', profile_ranks=[-1], profile_record_shapes=False, profile_save_path='./profile_dir', profile_step_end=12, profile_step_start=10, profile_with_cpu=False, profile_with_memory=False, profile_with_stack=False, prompt_format='qwen2', prompt_type=None, q_lora_rank=None, qk_layernorm=False, qk_nope_head_dim=None, qk_rope_head_dim=None, query_in_block_prob=0.1, query_pre_attn_scalar=None, rampup_batch_size=None, rank=0, recompute_activation_function=False, recompute_activation_function_num_layers=None, recompute_granularity='full', recompute_in_advance=False, recompute_in_bubble=False, recompute_method='block', recompute_num_layers=48, reduce_recompute_for_last_chunk=False, ref_model=None, reset_attention_mask=False, reset_position_ids=False, retriever_report_topk_accuracies=[], retriever_score_scaling=False, retriever_seq_length=256, retro_add_retriever=False, retro_attention_gate=1, retro_cyclic_train_iters=None, retro_encoder_attention_dropout=0.1, retro_encoder_hidden_dropout=0.1, retro_encoder_layers=2, retro_num_neighbors=2, retro_num_retrieved_chunks=2, retro_project_dir=None, retro_verify_neighbor_count=True, reuse_fp32_param=False, rope_scaling_beta_fast=32, rope_scaling_beta_slow=1, rope_scaling_factor=1.0, rope_scaling_mscale=1.0, rope_scaling_mscale_all_dim=0.0, rope_scaling_original_max_position_embeddings=None, rope_scaling_type=None, rotary_base=1000000.0, rotary_interleaved=False, rotary_percent=1.0, rotary_seq_len_interpolation_factor=None, routed_scaling_factor=None, sample_rate=1.0, save='/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//', save_interval=100, scale_depth=None, scale_emb=None, scatter_gather_tensors_in_pipeline=True, seed=424242, seq_aux=False, seq_length=131072, sequence_parallel=True, sgd_momentum=0.9, shape_order='SBH', shared_expert_gate=False, shared_expert_gate_output_dimension=1, 
short_seq_prob=0.1, skip_bias_add=True, skip_train=False, sliding_window=None, sparse_mode=4, spec=None, split='100,0,0', square_alibi_mask=False, squared_relu=False, stage=None, standalone_embedding_stage=False, start_weight_decay=0.0, swap_attention=False, swap_modules=None, swiglu=True, swin_backbone_type='tiny', tensor_model_parallel_size=8, tensorboard_dir=None, tensorboard_log_interval=1, tensorboard_queue_size=1000, test_data_path=None, test_mode=False, timing_log_level=0, timing_log_option='minmax', titles_data_path=None, tokenizer_kwargs=None, tokenizer_model=None, tokenizer_name_or_path='/data_4/models/Qwen/Qwen2.5-14B-Instruct/', tokenizer_not_use_fast=True, tokenizer_padding_side='right', tokenizer_type='PretrainedFromHF', topk_group=None, tp_2d=False, tp_comm_bulk_dgrad=True, tp_comm_bulk_wgrad=True, tp_comm_overlap=False, tp_comm_overlap_ag=True, tp_comm_overlap_cfg=None, tp_comm_overlap_rs=True, tp_comm_split_ag=True, tp_comm_split_rs=True, tp_x=1, tp_y=1, train_data_path=None, train_iters=1000, train_samples=None, transformer_impl='local', transformer_pipeline_model_parallel_size=1, ulysses_degree_in_cp=None, untie_embeddings_and_output_weights=True, use_checkpoint_args=False, use_checkpoint_opt_param_scheduler=False, use_cp_send_recv_overlap=True, use_cpu_initialization=None, use_deter_comp=False, use_dist_ckpt=False, use_distributed_optimizer=True, use_flash_attn=True, use_fused_moe_token_permute_and_unpermute=False, use_fused_ring_attention_update=False, use_fused_rmsnorm=True, use_fused_rotary_pos_emb=True, use_fused_swiglu=True, use_glm_rope=False, use_mc2=False, use_mcore_models=True, use_one_sent_docs=False, use_ring_exchange_p2p=False, use_rotary_position_embeddings=True, v_head_dim=None, valid_data_path=None, variable_seq_lengths=False, virtual_pipeline_model_parallel_size=None, vision_backbone_type='vit', vision_context_parallel=False, vision_downsample_ratio=0.5, vision_downsample_stride=1.0, vision_model_freeze=True, vision_model_lr_decay_rate=1.0, vision_model_lr_mult=1.0, vision_model_recompute=False, vision_model_type='intern_300m', vision_normalize_type='imagenet', vision_pretraining=False, vision_pretraining_type='classify', vision_process_type='dynamic', vision_projector_freeze=False, vision_projector_pre_norm=True, vision_projector_recompute=False, vision_projector_type='mlp', vision_seq_length=1025, vit_load='/', vocab_extra_ids=0, vocab_file=None, vocab_size=None, wandb_exp_name='', wandb_project='', wandb_save_dir='', weight_decay=0.0, weight_decay_incr_style='constant', world_size=512, yaml_cfg=None)vision_projector_recompute False + +model_provider config TransformerConfig(tensor_model_parallel_size=8, pipeline_model_parallel_size=1, virtual_pipeline_model_parallel_size=None, sequence_parallel=True, context_parallel_size=2, expert_model_parallel_size=1, perform_initialization=True, use_cpu_initialization=None, fp16=False, bf16=True, params_dtype=torch.bfloat16, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, enable_autocast=False, autocast_dtype=torch.bfloat16, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, pipeline_dtype=torch.bfloat16, variable_seq_lengths=False, 
overlap_p2p_comm=False, batch_p2p_comm=False, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=True, defer_embedding_wgrad_compute=False, pipeline_model_parallel_split_rank=None, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=True, cpu_offloading_weights=True, barrier_with_L1_time=True, num_layers=48, first_pipeline_num_layers=0, independent_parallel=False, hidden_size=5120, num_attention_heads=40, num_query_groups=8, ffn_hidden_size=13824, kv_channels=128, hidden_dropout=0.0, attention_dropout=0.0, fp32_residual_connection=False, apply_residual_connection_post_layernorm=False, layernorm_epsilon=1e-06, layernorm_zero_centered_gamma=False, add_bias_linear=False, add_qkv_bias=True, gated_linear_unit=True, activation_func=, activation_func_fp8_input_store=False, num_moe_experts=None, rotary_interleaved=False, window_size=None, normalization='RMSNorm', qk_layernorm=False, test_mode=False, init_method=.init_ at 0x7fb2591db310>, output_layer_init_method=.init_ at 0x7fb23001b310>, init_method_std=0.01, apply_query_key_layer_scaling=False, attention_softmax_in_fp32=True, bias_activation_fusion=True, masked_softmax_fusion=False, persist_layer_norm=True, memory_efficient_layer_norm=False, bias_dropout_fusion=True, apply_rope_fusion=True, recompute_granularity='full', recompute_method='block', recompute_num_layers=48, distribute_saved_activations=False, fp8=None, fp8_margin=0, fp8_interval=1, fp8_amax_history_len=1, fp8_amax_compute_algo='most_recent', fp8_wgrad=True, fp8_dot_product_attention=False, fp8_multi_head_attention=False, moe_router_load_balancing_type='aux_loss', moe_router_topk=2, moe_grouped_gemm=False, moe_aux_loss_coeff=0.0, moe_z_loss_coeff=0.0, moe_input_jitter_eps=None, moe_token_dropping=False, moe_token_dispatcher_type='allgather', moe_per_layer_logging=False, moe_expert_capacity_factor=None, moe_pad_expert_input_to_capacity=False, moe_token_drop_policy='probs', moe_layer_recompute=False, clone_scatter_output_in_embedding=True, disable_parameter_transpose_cache=False, enable_cuda_graph=False, max_position_embeddings=131072, rotary_percent=1.0) +vision_projector_recompute Falsemodel_provider transformer_layer_spec ModuleSpec(module=, params={}, submodules=TransformerLayerSubmodules(input_layernorm=, self_attention=ModuleSpec(module=, params={'attn_mask_type': }, submodules=SelfAttentionSubmodules(linear_qkv=, core_attention=, linear_proj=, q_layernorm=, k_layernorm=)), post_attn_norm=, self_attn_bda=, pre_cross_attn_layernorm=, cross_attention=, cross_attn_bda=, pre_mlp_layernorm=, mlp=ModuleSpec(module=, params={}, submodules=MLPSubmodules(linear_fc1=, linear_fc2=)), post_mlp_layernorm=, mlp_bda=, sharded_state_dict_keys_map={'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_'})) + +vision_projector_recompute False +Building intern_300m model ... +------------------------ vit_args ------------------------ + accumulate_allreduce_grads_in_fp32 .............. True + activation_func ................................. + adam_beta1 ...................................... 0.9 + adam_beta2 ...................................... 0.999 + adam_eps ........................................ 1e-08 + adaptive_cp_dynamic_attn_mask ................... False + adaptive_cp_manually_set_mask_list .............. False + adaptive_cp_only_reschedule ..................... False + adaptive_cp_without_coarse ...................... 
False + adaptive_recompute_device_size .................. -1 + adaptive_recompute_device_swap .................. False + adaptive_recompute_profiling_step ............... 10 + add_bias_linear ................................. True + add_class_token ................................. True + add_dense_bias .................................. False + add_position_embedding .......................... True + add_qkv_bias .................................... True + add_rmsnorm_offset .............................. False + adlr_autoresume ................................. False + adlr_autoresume_interval ........................ 1000 + apply_layernorm_1p .............................. False + apply_query_key_layer_scaling ................... False + apply_residual_connection_post_layernorm ........ False + apply_rope_fusion ............................... False + async_tensor_model_parallel_allreduce ........... False + attention_dropout ............................... 0.0 + attention_mask_on_cpu ........................... False + attention_softmax_in_fp32 ....................... True + attn_logit_softcapping .......................... None + auto_detect_ckpt_format ......................... False + barrier_with_L1_time ............................ True + bert_binary_head ................................ True + bert_embedder_type .............................. megatron + bert_load ....................................... None + bf16 ............................................ True + bias_activation_fusion .......................... False + bias_dropout_fusion ............................. False + bias_gelu_fusion ................................ False + bias_swiglu_fusion .............................. True + biencoder_projection_dim ........................ 0 + biencoder_shared_query_context_model ............ False + block_data_path ................................. None + check_for_nan_in_loss_and_grad .................. True + chunk_size ...................................... 4096 + ckpt_fully_parallel_save ........................ False + ckpt_step ....................................... None + classes_fraction ................................ 1.0 + clip_grad ....................................... 1.0 + clone_scatter_output_in_embedding ............... True + consumed_train_samples .......................... 0 + consumed_valid_samples .......................... 0 + context_parallel_algo ........................... megatron_cp_algo + context_parallel_size ........................... 1 + cp_attention_mask_type .......................... causal + cp_window_size .................................. 1 + create_attention_mask_in_dataloader ............. False + cross_dataset_joint ............................. True + data_cache_path ................................. None + data_parallel_random_init ....................... False + data_parallel_size .............................. 32 + data_path ....................................... ['/local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml'] + data_per_class_fraction ......................... 1.0 + data_seq_length ................................. 131072 + data_sharding ................................... True + dataloader_type ................................. single + decoder_num_layers .............................. None + decoder_seq_length .............................. 1025 + decoupled_lr .................................... None + decoupled_min_lr ................................ None + delay_grad_reduce ............................... 
True + delay_param_gather .............................. False + dim_model_base .................................. None + dino_bottleneck_size ............................ 256 + dino_freeze_last_layer .......................... 1 + dino_head_hidden_size ........................... 2048 + dino_local_crops_number ......................... 10 + dino_local_img_size ............................. 96 + dino_norm_last_layer ............................ False + dino_teacher_temp ............................... 0.07 + dino_warmup_teacher_temp ........................ 0.04 + dino_warmup_teacher_temp_epochs ................. 30 + dist_ckpt_format ................................ torch_dist + distribute_saved_activations .................... False + distributed_backend ............................. nccl + distributed_timeout_minutes ..................... 120 + dpo_beta ........................................ 0.1 + dpo_ftx ......................................... 0.0 + dpo_label_smoothing ............................. 0.0 + dpo_loss_type ................................... sigmoid + embed_layernorm ................................. False + embedding_multiplier_scale ...................... 1.0 + embedding_path .................................. None + empty_unused_memory_level ....................... 0 + enable_chunk_memory ............................. False + enable_chunk_sequence ........................... False + enable_hbmfault_repair .......................... False + enable_high_availability ........................ False + enable_one_logger ............................... False + enable_optimizer_state_local_copy ............... False + enable_recompute_layers_per_pp_rank ............. False + enable_token_rearrange_opt ...................... False + encoder_num_layers .............................. 48 + encoder_seq_length .............................. 1025 + end_weight_decay ................................ 0.0 + eod_mask_loss ................................... False + eval_interval ................................... 100 + eval_iters ...................................... 0 + evidence_data_path .............................. None + exit_duration_in_mins ........................... None + exit_interval ................................... None + exit_on_missing_checkpoint ...................... False + exit_signal_handler ............................. False + expert_interval ................................. 1 + expert_model_parallel_size ...................... 1 + ffn_hidden_size ................................. 4096 + fill_neg_inf .................................... False + finetune ........................................ True + first_k_dense_replace ........................... None + first_pipeline_num_layers ....................... 0 + fp16 ............................................ False + fp16_lm_cross_entropy ........................... False + fp32_residual_connection ........................ False + fp8 ............................................. None + fp8_amax_compute_algo ........................... most_recent + fp8_amax_history_len ............................ 1 + fp8_interval .................................... 1 + fp8_margin ...................................... 0 + fp8_wgrad ....................................... True + full_shuffle_instruction_dataset ................ False + gated_linear_unit ............................... False + geglu ........................................... False + gelu_tanh ....................................... 
False + global_batch_size ............................... 64 + gradient_accumulation_fusion .................... False + group_query_attention ........................... False + head_lr_mult .................................... 1.0 + hidden_dropout .................................. 0.0 + hidden_size ..................................... 1024 + high_freq_factor ................................ None + hysteresis ...................................... 2 + ict_head_size ................................... None + ict_load ........................................ None + image_size ...................................... 448 + image_token_length .............................. 256 + img_h ........................................... 448 + img_w ........................................... 448 + independent_parallel ............................ True + indexer_batch_size .............................. 128 + indexer_log_interval ............................ 1000 + inference_batch_times_seqlen_threshold .......... 512 + init_method_std ................................. 0.01 + init_method_xavier_uniform ...................... False + initial_loss_scale .............................. 4096.0 + input_embeds_norm ............................... False + input_jitter .................................... True + input_layernorm_in_fp32 ......................... False + interleave_sliding_window ....................... None + is_instruction_dataset .......................... True + is_pairwise_dataset ............................. False + iter_per_epoch .................................. 1250 + jit_compile ..................................... False + kv_channels ..................................... 64 + kv_head_repeat_before_uly_alltoall .............. True + kv_lora_rank .................................... None + language_model_freeze ........................... False + layernorm_zero_centered_gamma ................... False + lazy_mpu_init ................................... None + load ............................................ / + load_checkpoint_loosely ......................... False + local_rank ...................................... 0 + log_batch_size_to_tensorboard ................... False + log_interval .................................... 1 + log_learning_rate_to_tensorboard ................ True + log_loss_scale_to_tensorboard ................... True + log_memory_to_tensorboard ....................... False + log_num_zeros_in_grad ........................... False + log_params_norm ................................. False + log_progress .................................... False + log_throughput .................................. True + log_timers_to_tensorboard ....................... False + log_validation_ppl_to_tensorboard ............... False + log_world_size_to_tensorboard ................... False + logit_mask ...................................... False + lora_alpha ...................................... 32 + lora_fusion ..................................... False + lora_load ....................................... None + lora_modules_to_save ............................ None + lora_r .......................................... 16 + lora_register_forward_hook ...................... ['word_embeddings', 'input_layernorm'] + lora_target_modules ............................. [] + loss_scale ...................................... None + loss_scale_window ............................... 1000 + low_freq_factor ................................. 
None + lr .............................................. 5e-06 + lr_decay_iters .................................. None + lr_decay_samples ................................ None + lr_decay_style .................................. cosine + lr_warmup_fraction .............................. 0.03 + lr_warmup_init .................................. 0.0 + lr_warmup_iters ................................. 0 + lr_warmup_samples ............................... 0 + make_vocab_size_divisible_by .................... 1 + manual_gc ....................................... False + manual_gc_eval .................................. True + manual_gc_interval .............................. 0 + mask_factor ..................................... 1.0 + mask_prob ....................................... 0.15 + mask_type ....................................... random + masked_softmax_fusion ........................... False + max_fps ......................................... 1 + max_num_frame ................................... 512 + max_num_image ................................... 8 + max_patch_grid .................................. 12 + max_position_embeddings ......................... 1025 + max_tokens_to_oom ............................... 12000 + merge_file ...................................... None + micro_batch_size ................................ 1 + min_loss_scale .................................. 1.0 + min_lr .......................................... 1e-07 + min_patch_grid .................................. 1 + mmap_bin_files .................................. True + mock_data ....................................... False + model_type ...................................... ModelType.encoder_or_decoder + moe_allgather_overlap_comm ...................... False + moe_alltoall_overlap_comm ....................... False + moe_aux_loss_coeff .............................. 0.0 + moe_comm_aux_loss_coeff ......................... 0.0 + moe_device_level_aux_loss_coeff ................. 0.0 + moe_expert_capacity_factor ...................... None + moe_grouped_gemm ................................ False + moe_input_jitter_eps ............................ None + moe_intermediate_size ........................... None + moe_layer_freq .................................. None + moe_pad_expert_input_to_capacity ................ False + moe_per_layer_logging ........................... False + moe_permutation_async_comm ...................... False + moe_router_load_balancing_type .................. aux_loss + moe_router_topk ................................. 2 + moe_token_dispatcher_type ....................... allgather + moe_token_drop_policy ........................... probs + moe_token_dropping .............................. False + moe_tp_extend_ep ................................ False + moe_train_capacity_factor ....................... 1.0 + moe_without_activation .......................... False + moe_z_loss_coeff ................................ 0.0 + moe_zero_memory ................................. disable + multi_head_latent_attention ..................... False + n_shared_experts ................................ None + nccl_communicator_config_path ................... None + next_tockens .................................... 0 + no_load_optim ................................... True + no_load_rng ..................................... True + no_persist_layer_norm ........................... False + no_post_layer_norm .............................. False + no_save_optim ................................... 
None + no_save_rng ..................................... None + no_shared_storage ............................... False + no_shuffle ...................................... False + noisy_gate_policy ............................... None + noop_layers ..................................... None + norm_epsilon .................................... 1e-06 + norm_topk_prob .................................. False + normalization ................................... LayerNorm + num_attention_heads ............................. 16 + num_channels .................................... 3 + num_classes ..................................... 1000 + num_experts ..................................... None + num_layer_list .................................. + num_layers ...................................... 24 + num_layers_per_virtual_pipeline_stage ........... None + num_query_groups ................................ 16 + num_workers ..................................... 8 + one_logger_entity ............................... hwinf_dcm + one_logger_project .............................. e2e-tracking + one_logger_run_name ............................. None + onnx_safe ....................................... None + openai_gelu ..................................... False + optimizer ....................................... adam + original_max_position_embeddings ................ None + output_bert_embeddings .......................... False + output_layer_slice_num .......................... 1 + output_logit_softcapping ........................ None + output_multiplier_scale ......................... None + overlap_grad_reduce ............................. False + overlap_p2p_comm ................................ False + overlap_param_gather ............................ False + override_opt_param_scheduler .................... False + pad_to_multiple_of .............................. 8 + padded_vocab_size ............................... 152064 + params_dtype .................................... torch.bfloat16 + patch_dim ....................................... 14 + perform_initialization .......................... True + pipeline_model_parallel_size .................... 1 + pipeline_model_parallel_split_rank .............. None + position_embedding_type ......................... rope + post_norm ....................................... False + pre_tockens ..................................... 65536 + pref_ftx ........................................ 0.0 + pretrained_checkpoint ........................... None + profile ......................................... False + profile_level ................................... level0 + profile_ranks ................................... [-1] + profile_record_shapes ........................... False + profile_save_path ............................... ./profile_dir + profile_step_end ................................ 12 + profile_step_start .............................. 10 + profile_with_cpu ................................ False + profile_with_memory ............................. False + profile_with_stack .............................. False + prompt_format ................................... qwen2 + prompt_type ..................................... None + q_lora_rank ..................................... None + qk_layernorm .................................... False + qk_nope_head_dim ................................ None + qk_rope_head_dim ................................ None + query_in_block_prob ............................. 
0.1 + query_pre_attn_scalar ........................... None + rampup_batch_size ............................... None + rank ............................................ 0 + recompute_activation_function ................... False + recompute_activation_function_num_layers ........ None + recompute_granularity ........................... None + recompute_in_advance ............................ False + recompute_in_bubble ............................. False + recompute_method ................................ None + recompute_num_layers ............................ None + reduce_recompute_for_last_chunk ................. False + ref_model ....................................... None + reset_attention_mask ............................ False + reset_position_ids .............................. False + retriever_report_topk_accuracies ................ [] + retriever_score_scaling ......................... False + retriever_seq_length ............................ 256 + retro_add_retriever ............................. False + retro_attention_gate ............................ 1 + retro_cyclic_train_iters ........................ None + retro_encoder_attention_dropout ................. 0.1 + retro_encoder_hidden_dropout .................... 0.1 + retro_encoder_layers ............................ 2 + retro_num_neighbors ............................. 2 + retro_num_retrieved_chunks ...................... 2 + retro_project_dir ............................... None + retro_verify_neighbor_count ..................... True + reuse_fp32_param ................................ False + rope_scaling_beta_fast .......................... 32 + rope_scaling_beta_slow .......................... 1 + rope_scaling_factor ............................. 1.0 + rope_scaling_mscale ............................. 1.0 + rope_scaling_mscale_all_dim ..................... 0.0 + rope_scaling_original_max_position_embeddings ... None + rope_scaling_type ............................... None + rotary_base ..................................... 1000000.0 + rotary_interleaved .............................. False + rotary_percent .................................. 1.0 + rotary_seq_len_interpolation_factor ............. None + routed_scaling_factor ........................... None + sample_rate ..................................... 1.0 + save ............................................ /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213// + save_interval ................................... 100 + scale_depth ..................................... None + scale_emb ....................................... None + scatter_gather_tensors_in_pipeline .............. True + seed ............................................ 424242 + seq_aux ......................................... False + seq_length ...................................... 1025 + sequence_parallel ............................... False + sgd_momentum .................................... 0.9 + shape_order ..................................... SBH + shared_expert_gate .............................. False + shared_expert_gate_output_dimension ............. 1 + short_seq_prob .................................. 0.1 + skip_bias_add ................................... True + skip_train ...................................... False + sliding_window .................................. None + sparse_mode ..................................... 4 + spec ............................................ 
None + split ........................................... 100,0,0 + square_alibi_mask ............................... False + squared_relu .................................... False + stage ........................................... None + standalone_embedding_stage ...................... False + start_weight_decay .............................. 0.0 + swap_attention .................................. False + swap_modules .................................... None + swiglu .......................................... False + swin_backbone_type .............................. tiny + tensor_model_parallel_size ...................... 8 + tensorboard_dir ................................. None + tensorboard_log_interval ........................ 1 + tensorboard_queue_size .......................... 1000 + test_data_path .................................. None + test_mode ....................................... False + timing_log_level ................................ 0 + timing_log_option ............................... minmax + titles_data_path ................................ None + tokenizer_kwargs ................................ None + tokenizer_model ................................. None + tokenizer_name_or_path .......................... /data_4/models/Qwen/Qwen2.5-14B-Instruct/ + tokenizer_not_use_fast .......................... True + tokenizer_padding_side .......................... right + tokenizer_type .................................. PretrainedFromHF + topk_group ...................................... None + tp_2d ........................................... False + tp_comm_bulk_dgrad .............................. True + tp_comm_bulk_wgrad .............................. True + tp_comm_overlap ................................. False + tp_comm_overlap_ag .............................. True + tp_comm_overlap_cfg ............................. None + tp_comm_overlap_rs .............................. True + tp_comm_split_ag ................................ True + tp_comm_split_rs ................................ True + tp_x ............................................ 1 + tp_y ............................................ 1 + train_data_path ................................. None + train_iters ..................................... 1000 + train_samples ................................... None + transformer_impl ................................ local + transformer_pipeline_model_parallel_size ........ 1 + ulysses_degree_in_cp ............................ None + untie_embeddings_and_output_weights ............. True + use_checkpoint_args ............................. False + use_checkpoint_opt_param_scheduler .............. False + use_cp_send_recv_overlap ........................ True + use_cpu_initialization .......................... None + use_deter_comp .................................. False + use_dist_ckpt ................................... False + use_distributed_optimizer ....................... True + use_flash_attn .................................. True + use_fused_moe_token_permute_and_unpermute ....... False + use_fused_ring_attention_update ................. False + use_fused_rmsnorm ............................... True + use_fused_rotary_pos_emb ........................ True + use_fused_swiglu ................................ True + use_glm_rope .................................... False + use_mc2 ......................................... False + use_mcore_models ................................ True + use_one_sent_docs ............................... 
False + use_ring_exchange_p2p ........................... False + use_rotary_position_embeddings .................. True + v_head_dim ...................................... None + valid_data_path ................................. None + variable_seq_lengths ............................ False + virtual_pipeline_model_parallel_size ............ None + vision_backbone_type ............................ vit + vision_context_parallel ......................... False + vision_downsample_ratio ......................... 0.5 + vision_downsample_stride ........................ 1.0 + vision_model_freeze ............................. True + vision_model_lr_decay_rate ...................... 1.0 + vision_model_lr_mult ............................ 1.0 + vision_model_recompute .......................... False + vision_model_type ............................... intern_300m + vision_normalize_type ........................... imagenet + vision_pretraining .............................. False + vision_pretraining_type ......................... classify + vision_process_type ............................. dynamic + vision_projector_freeze ......................... False + vision_projector_pre_norm ....................... True + vision_projector_recompute ...................... False + vision_projector_type ........................... mlp + vision_seq_length ............................... 1025 + vit_load ........................................ / + vocab_extra_ids ................................. 0 + vocab_file ...................................... None + vocab_size ...................................... None + wandb_exp_name .................................. + wandb_project ................................... + wandb_save_dir .................................. + weight_decay .................................... 0.0 + weight_decay_incr_style ......................... constant + world_size ...................................... 512 + yaml_cfg ........................................ 
None +-------------------- end of vit_args --------------------- +MegatronVisionModel vision_model_config VisionTransformerConfig(tensor_model_parallel_size=8, pipeline_model_parallel_size=1, virtual_pipeline_model_parallel_size=None, sequence_parallel=False, context_parallel_size=1, expert_model_parallel_size=1, perform_initialization=True, use_cpu_initialization=None, fp16=False, bf16=True, params_dtype=torch.bfloat16, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, enable_autocast=False, autocast_dtype=torch.bfloat16, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, pipeline_dtype=torch.bfloat16, variable_seq_lengths=False, overlap_p2p_comm=False, batch_p2p_comm=False, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=True, defer_embedding_wgrad_compute=False, pipeline_model_parallel_split_rank=None, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=True, cpu_offloading_weights=True, barrier_with_L1_time=True, num_layers=24, first_pipeline_num_layers=0, independent_parallel=True, hidden_size=1024, num_attention_heads=16, num_query_groups=16, ffn_hidden_size=4096, kv_channels=64, hidden_dropout=0.0, attention_dropout=0.0, fp32_residual_connection=False, apply_residual_connection_post_layernorm=False, layernorm_epsilon=1e-06, layernorm_zero_centered_gamma=False, add_bias_linear=True, add_qkv_bias=True, gated_linear_unit=False, activation_func=, activation_func_fp8_input_store=False, num_moe_experts=None, rotary_interleaved=False, window_size=None, normalization='LayerNorm', qk_layernorm=False, test_mode=False, init_method=.init_ at 0x7fb23001b3a0>, output_layer_init_method=.init_ at 0x7fb23001be50>, init_method_std=0.01, apply_query_key_layer_scaling=False, attention_softmax_in_fp32=True, bias_activation_fusion=False, masked_softmax_fusion=False, persist_layer_norm=True, memory_efficient_layer_norm=False, bias_dropout_fusion=False, apply_rope_fusion=False, recompute_granularity=None, recompute_method=None, recompute_num_layers=None, distribute_saved_activations=False, fp8=None, fp8_margin=0, fp8_interval=1, fp8_amax_history_len=1, fp8_amax_compute_algo='most_recent', fp8_wgrad=True, fp8_dot_product_attention=False, fp8_multi_head_attention=False, moe_router_load_balancing_type='aux_loss', moe_router_topk=2, moe_grouped_gemm=False, moe_aux_loss_coeff=0.0, moe_z_loss_coeff=0.0, moe_input_jitter_eps=None, moe_token_dropping=False, moe_token_dispatcher_type='allgather', moe_per_layer_logging=False, moe_expert_capacity_factor=None, moe_pad_expert_input_to_capacity=False, moe_token_drop_policy='probs', moe_layer_recompute=False, clone_scatter_output_in_embedding=True, disable_parameter_transpose_cache=False, enable_cuda_graph=False, max_position_embeddings=1025, rotary_percent=1.0, img_w=448, img_h=448, patch_dim=14) +MegatronVisionModel vision_model_layer_spec ModuleSpec(module=, params={}, submodules=TransformerLayerSubmodules(input_layernorm=, self_attention=ModuleSpec(module=, params={'attn_mask_type': }, submodules=SelfAttentionSubmodules(linear_qkv=, core_attention=, linear_proj=, q_layernorm=None, k_layernorm=None)), 
post_attn_norm=, self_attn_bda=, pre_cross_attn_layernorm=, cross_attention=, cross_attn_bda=, pre_mlp_layernorm=, mlp=ModuleSpec(module=, params={}, submodules=MLPSubmodules(linear_fc1=, linear_fc2=)), post_mlp_layernorm=, mlp_bda=, sharded_state_dict_keys_map={})) +WARNING: could not find the metadata file /latest_checkpointed_iteration.txt + will not load any checkpoints and will start from random +vision_projector_config TransformerConfig(tensor_model_parallel_size=8, pipeline_model_parallel_size=1, virtual_pipeline_model_parallel_size=None, sequence_parallel=True, context_parallel_size=2, expert_model_parallel_size=1, perform_initialization=True, use_cpu_initialization=None, fp16=False, bf16=True, params_dtype=torch.bfloat16, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, enable_autocast=False, autocast_dtype=torch.bfloat16, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, pipeline_dtype=torch.bfloat16, variable_seq_lengths=False, overlap_p2p_comm=False, batch_p2p_comm=False, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=True, defer_embedding_wgrad_compute=False, pipeline_model_parallel_split_rank=None, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=True, cpu_offloading_weights=True, barrier_with_L1_time=True, num_layers=48, first_pipeline_num_layers=0, independent_parallel=False, hidden_size=5120, num_attention_heads=40, num_query_groups=8, ffn_hidden_size=1024, kv_channels=128, hidden_dropout=0.0, attention_dropout=0.0, fp32_residual_connection=False, apply_residual_connection_post_layernorm=False, layernorm_epsilon=1e-06, layernorm_zero_centered_gamma=False, add_bias_linear=False, add_qkv_bias=True, gated_linear_unit=False, activation_func=, activation_func_fp8_input_store=False, num_moe_experts=None, rotary_interleaved=False, window_size=None, normalization='RMSNorm', qk_layernorm=False, test_mode=False, init_method=.init_ at 0x7fb23001bee0>, output_layer_init_method=.init_ at 0x7fb23001bf70>, init_method_std=0.01, apply_query_key_layer_scaling=False, attention_softmax_in_fp32=True, bias_activation_fusion=False, masked_softmax_fusion=False, persist_layer_norm=True, memory_efficient_layer_norm=False, bias_dropout_fusion=True, apply_rope_fusion=True, recompute_granularity='full', recompute_method='block', recompute_num_layers=48, distribute_saved_activations=False, fp8=None, fp8_margin=0, fp8_interval=1, fp8_amax_history_len=1, fp8_amax_compute_algo='most_recent', fp8_wgrad=True, fp8_dot_product_attention=False, fp8_multi_head_attention=False, moe_router_load_balancing_type='aux_loss', moe_router_topk=2, moe_grouped_gemm=False, moe_aux_loss_coeff=0.0, moe_z_loss_coeff=0.0, moe_input_jitter_eps=None, moe_token_dropping=False, moe_token_dispatcher_type='allgather', moe_per_layer_logging=False, moe_expert_capacity_factor=None, moe_pad_expert_input_to_capacity=False, moe_token_drop_policy='probs', moe_layer_recompute=False, clone_scatter_output_in_embedding=True, disable_parameter_transpose_cache=False, enable_cuda_graph=False, max_position_embeddings=131072, rotary_percent=1.0) +vision_projector_layer_spec 
MLPSubmodules(linear_fc1=, linear_fc2=) +vision_model_freeze +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. 
+ + +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. +vision_model_freeze + +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +vision_model_freeze + + +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
+ + +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
+ + +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + + + +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. 
+vision_model_freeze
+=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
+=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
+=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+[... the same set of "requires grad to False" messages repeats for external_feature_model.vit.decoder.layers.1 through layers.23; the per-rank output is interleaved in the original log ...]
+ +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. 
+ + + + +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +vision_model_freeze +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
+ + +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.vision_model_freeze=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. + + + + +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. 
+ + + +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + + + + +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
[the same fourteen parameters are set to requires grad False for every InternViT decoder layer 0-23; each parallel worker process prints its own copy of these messages]
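The block above shows the InternViT vision encoder being frozen before training starts. As a minimal sketch only, assuming plain PyTorch and an illustrative helper name (freeze_module is not taken from the training code), messages in exactly this style can be produced like so:

import torch.nn as nn

def freeze_module(module: nn.Module, prefix: str = "") -> None:
    # Disable gradients for every parameter under `module` and report it in the
    # same format as the log lines above.
    for name, param in module.named_parameters(prefix=prefix):
        param.requires_grad = False
        print(f"=> set param {name} {param.shape} requires grad to False.")

# Hypothetical usage against the model printed below:
# freeze_module(model.external_feature_model.vit, prefix="external_feature_model.vit")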
+model GPTVLModel(
+  (external_feature_model): MegatronVisionModel(
+    (vit): InternViTModel(
+      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
+      (position_embeddings): Embedding(1025, 1024)
+      (decoder): TransformerBlock(
+        (layers): ModuleList(
+          (0-23): 24 x InternViTTransformerLayer(
+            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
+            (self_attention): SelfAttention(
+              (core_attention): DotProductAttention(
+                (scale_mask_softmax): FusedScaleMaskSoftmax()
+                (attention_dropout): Dropout(p=0.0, inplace=False)
+              )
+              (linear_proj): RowParallelLinear()
+              (linear_qkv): ColumnParallelLinear()
+            )
+            (self_attn_bda): IdentityFuncOp()
+            (pre_cross_attn_layernorm): IdentityOp()
+            (cross_attention): IdentityOp()
+            (cross_attn_bda): IdentityFuncOp()
+            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
+            (mlp): MLP(
+              (linear_fc1): ColumnParallelLinear()
+              (linear_fc2): RowParallelLinear()
+            )
+            (mlp_bda): IdentityFuncOp()
+          )
+        )
+      )
+    )
+    (vision_projection): MultimodalProjector(
+      (encoder): MLP(
+        (linear_fc1): ColumnParallelLinear()
+        (linear_fc2): RowParallelLinear()
+      )
+    )
+    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
+  )
+  (embedding): LanguageModelEmbedding(
+    (word_embeddings): VocabParallelEmbedding()
+    (embedding_dropout): Dropout(p=0.0, inplace=False)
+  )
+  (rotary_pos_emb): RotaryEmbedding()
+  (decoder): TransformerBlock(
+    (layers): ModuleList(
+      (0-47): 48 x TransformerLayer(
+        (input_layernorm): RMSNorm()
+        (self_attention): SelfAttention(
+          (core_attention): DotProductAttention(
+            (scale_mask_softmax): FusedScaleMaskSoftmax()
+            (attention_dropout): Dropout(p=0.0, inplace=False)
+          )
+          (linear_proj): RowParallelLinear()
+          (linear_qkv): ColumnParallelLinear()
+          (q_layernorm): IdentityOp()
+          (k_layernorm): IdentityOp()
+        )
+        (pre_cross_attn_layernorm): IdentityOp()
+        (cross_attention): IdentityOp()
+        (cross_attn_bda): IdentityFuncOp()
+        (pre_mlp_layernorm): RMSNorm()
+        (mlp): MLP(
+          (linear_fc1): ColumnParallelLinear()
+          (linear_fc2): RowParallelLinear()
+        )
+      )
+    )
+    (final_layernorm): RMSNorm()
+  )
+  (output_layer): ColumnParallelLinear()
+)
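Given the module tree printed above, a quick way to confirm which top-level submodules ended up frozen is to group parameter counts by the leading name component. This is a sketch in plain PyTorch, not part of the logged run:

from collections import defaultdict
import torch.nn as nn

def summarize_trainable(model: nn.Module) -> None:
    # name -> [trainable element count, frozen element count]
    stats = defaultdict(lambda: [0, 0])
    for name, param in model.named_parameters():
        top = name.split(".")[0]  # e.g. "external_feature_model", "decoder", "embedding"
        stats[top][0 if param.requires_grad else 1] += param.numel()
    for top, (trainable, frozen) in sorted(stats.items()):
        print(f"{top}: trainable={trainable:,} frozen={frozen:,}")

With the freezing shown above, the vit parameters inside external_feature_model would all land in the frozen column.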
+=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
+ + +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
+ + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.vision_model_freeze=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. 
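The block of "requires grad to False" messages above records the stage-3 vision-encoder freeze: every parameter of the InternViT tower is excluded from the optimizer and its tensor-parallel shard shape is logged. A minimal sketch of such a freeze pass is shown below; the `model.external_feature_model.vit` handle and the `vision_model_freeze` print mirror the log lines, but this is an illustration, not the actual ModelLink code path.

```python
import torch.nn as nn

def freeze_vision_tower(model: nn.Module) -> None:
    """Minimal sketch of the freeze pass reflected in the log above.

    Assumes the composite model exposes the vision encoder as
    `model.external_feature_model.vit` (as in the printed module tree);
    the real training framework's implementation may differ.
    """
    print("vision_model_freeze")
    vit = model.external_feature_model.vit
    for name, param in vit.named_parameters():
        param.requires_grad = False
        # Same format as the log lines: "=> set param <name> <shape> requires grad to False."
        print(f"=> set param external_feature_model.vit.{name} "
              f"{param.shape} requires grad to False.")
```

Because the run uses tensor parallelism (tp8 in the script name), each rank holds only a 1/8 shard of the column/row-parallel weights, which is why the logged shapes (for example linear_qkv.weight as torch.Size([384, 1024]) rather than a full [3072, 1024]) are smaller than the checkpoint shapes and why the same message recurs once per rank in the interleaved log.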
+=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +) + + +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
+=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +vision_model_freeze=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
+model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +)=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. 
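+The "=> set param ... requires grad to False" lines above and below record the vision-tower freeze before stage-3 fine-tuning: every parameter under external_feature_model.vit has its gradient disabled, while no such messages appear for the vision projection or the language decoder, which suggests those parts stay trainable. A minimal sketch of how such a freeze step is typically implemented is shown below; the function name and exact print format are illustrative assumptions, not the repository's actual helper.
+
+from torch import nn
+
+def freeze_module(module: nn.Module, prefix: str = "") -> None:
+    # Disable gradients for every parameter of `module` and log each change
+    # in the same style as the '=> set param ...' lines in this log.
+    for name, param in module.named_parameters():
+        param.requires_grad = False
+        print(f"=> set param {prefix}{name} {param.shape} requires grad to False.")
+
+# e.g. freeze only the InternViT tower of the combined model (illustrative call):
+# freeze_module(model.external_feature_model.vit, prefix="external_feature_model.vit.")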
+ +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
+ +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
+> number of parameters on (tensor, pipeline) model parallel rank (6, 0): 1887497216
+=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
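+The "> number of parameters on (tensor, pipeline) model parallel rank (6, 0): 1887497216" line reports the size of one rank's model shard, not the full model. The rank index (6, 0) implies at least seven tensor-parallel ranks; if the run uses 8-way tensor parallelism, eight shards of roughly 1.89B parameters add up to about 15.1B, consistent with the 48-layer language decoder plus the 24-layer ViT and projection shown in the printout above. A per-rank count like this is usually just a sum over the shard's parameters; the helper below is an illustrative sketch, not Megatron's exact implementation.
+
+from torch import nn
+
+def report_param_count(model: nn.Module, tp_rank: int, pp_rank: int) -> int:
+    # Sum the elements of every parameter held by this rank's shard and
+    # print it in the same format as the log line above.
+    n_params = sum(p.numel() for p in model.parameters())
+    print(f"> number of parameters on (tensor, pipeline) model parallel rank "
+          f"({tp_rank}, {pp_rank}): {n_params}")
+    return n_params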
+=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
+vision_model_freeze
+=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. +vision_model_freeze=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
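
The vision_model_freeze step above walks every parameter of the InternViT tower and turns off its gradient. A minimal sketch of what that loop likely does, assuming plain PyTorch (the helper name freeze_vision_model is made up; only the printed message format is taken from the log):

    import torch

    def freeze_vision_model(vit: torch.nn.Module) -> None:
        # Exclude every ViT parameter from autograd and log it in the same
        # format as the messages above.
        for name, param in vit.named_parameters():
            param.requires_grad = False
            print(f"=> set param external_feature_model.vit.{name} {param.shape} requires grad to False.")
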
+model GPTVLModel(
+  (external_feature_model): MegatronVisionModel(
+    (vit): InternViTModel(
+      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
+      (position_embeddings): Embedding(1025, 1024)
+      (decoder): TransformerBlock(
+        (layers): ModuleList(
+          (0-23): 24 x InternViTTransformerLayer(
+            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
+            (self_attention): SelfAttention(
+              (core_attention): DotProductAttention(
+                (scale_mask_softmax): FusedScaleMaskSoftmax()
+                (attention_dropout): Dropout(p=0.0, inplace=False)
+              )
+              (linear_proj): RowParallelLinear()
+              (linear_qkv): ColumnParallelLinear()
+            )
+            (self_attn_bda): IdentityFuncOp()
+            (pre_cross_attn_layernorm): IdentityOp()
+            (cross_attention): IdentityOp()
+            (cross_attn_bda): IdentityFuncOp()
+            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
+            (mlp): MLP(
+              (linear_fc1): ColumnParallelLinear()
+              (linear_fc2): RowParallelLinear()
+            )
+            (mlp_bda): IdentityFuncOp()
+          )
+        )
+      )
+    )
+    (vision_projection): MultimodalProjector(
+      (encoder): MLP(
+        (linear_fc1): ColumnParallelLinear()
+        (linear_fc2): RowParallelLinear()
+      )
+    )
+    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
+  )
+  (embedding): LanguageModelEmbedding(
+    (word_embeddings): VocabParallelEmbedding()
+    (embedding_dropout): Dropout(p=0.0, inplace=False)
+  )
+  (rotary_pos_emb): RotaryEmbedding()
+  (decoder): TransformerBlock(
+    (layers): ModuleList(
+      (0-47): 48 x TransformerLayer(
+        (input_layernorm): RMSNorm()
+        (self_attention): SelfAttention(
+          (core_attention): DotProductAttention(
+            (scale_mask_softmax): FusedScaleMaskSoftmax()
+            (attention_dropout): Dropout(p=0.0, inplace=False)
+          )
+          (linear_proj): RowParallelLinear()
+          (linear_qkv): ColumnParallelLinear()
+          (q_layernorm): IdentityOp()
+          (k_layernorm): IdentityOp()
+        )
+        (pre_cross_attn_layernorm): IdentityOp()
+        (cross_attention): IdentityOp()
+        (cross_attn_bda): IdentityFuncOp()
+        (pre_mlp_layernorm): RMSNorm()
+        (mlp): MLP(
+          (linear_fc1): ColumnParallelLinear()
+          (linear_fc2): RowParallelLinear()
+        )
+      )
+    )
+    (final_layernorm): RMSNorm()
+  )
+  (output_layer): ColumnParallelLinear()
+)
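
The GPTVLModel dump above shows the overall composition: a frozen InternViT tower, a LayerNorm over 4096-dimensional merged patch features, an MLP projector, and the 48-layer Qwen2.5 decoder. A toy sketch of that data flow, assuming InternVL-style merging of four 1024-d patch tokens into 4096-d features and a 5120-d LLM hidden size (both assumptions; the module names follow the dump, the stub internals do not):

    import torch

    class TinyGPTVL(torch.nn.Module):
        def __init__(self, merged_dim=4096, llm_dim=5120):
            super().__init__()
            self.vit = torch.nn.Identity()                 # stand-in for the frozen InternViTModel
            self.pre_proj_layernorm = torch.nn.LayerNorm(merged_dim)
            self.vision_projection = torch.nn.Sequential(  # stand-in for MultimodalProjector.encoder
                torch.nn.Linear(merged_dim, llm_dim),
                torch.nn.GELU(),
                torch.nn.Linear(llm_dim, llm_dim),
            )
            self.decoder = torch.nn.Identity()             # stand-in for the 48-layer Qwen block

        def forward(self, patch_feats, text_embeds):
            # patch_feats: [n, 4096] merged ViT tokens; text_embeds: [m, 5120]
            v = self.vision_projection(self.pre_proj_layernorm(self.vit(patch_feats)))
            return self.decoder(torch.cat([v, text_embeds], dim=0))
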
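
The shard shapes in the freeze messages are consistent with the tp8 setting in the launch script: ColumnParallelLinear splits its output dimension across the eight tensor-parallel ranks, while RowParallelLinear splits its input dimension. A quick check of that arithmetic (the 4096 FFN width is inferred from 512 x 8 and is an assumption):

    # Per-rank shard shapes for the 1024-d InternViT with 8-way tensor parallelism.
    hidden, ffn, tp = 1024, 4096, 8
    qkv_rows = 3 * hidden // tp   # 384 -> linear_qkv.weight torch.Size([384, 1024])
    proj_cols = hidden // tp      # 128 -> linear_proj.weight torch.Size([1024, 128])
    fc1_rows = ffn // tp          # 512 -> mlp.linear_fc1.weight torch.Size([512, 1024])
    assert (qkv_rows, proj_cols, fc1_rows) == (384, 128, 512)
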
+=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1887497216=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
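The "> number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1887497216" line reports the parameter count held locally by that rank. In Megatron-style code this is typically just the sum of numel() over the rank's model shards; a rough sketch (assumed, not the exact ModelLink code):
```python
def report_rank_params(model_chunks, tp_rank: int, pp_rank: int) -> None:
    # Each rank only holds its tensor-/pipeline-parallel shard of the weights,
    # so this counts local parameters, not the full 14B+0.3B model.
    total = sum(p.numel() for chunk in model_chunks for p in chunk.parameters())
    print(f" > number of parameters on (tensor, pipeline) model parallel rank "
          f"({tp_rank}, {pp_rank}): {total}", flush=True)
```
With TP=8 and PP=1 in this run, every tensor-parallel rank reports the same ~1.89B local parameters: roughly an eighth of the sharded weights plus the unsharded ones.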
+=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +vision_model_freeze=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
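The repeated "=> set param ... requires grad to False." lines, together with the "vision_model_freeze" marker above, correspond to freezing the vision tower before stage-3 finetuning. A minimal sketch of such a freeze-and-log pass, assuming the parameter-name prefix seen in the log (the actual helper in cognitron_vl/ModelLink may differ):
```python
import torch.nn as nn

def freeze_vision_model(model: nn.Module, prefix: str = "external_feature_model.vit") -> None:
    """Freeze every vision-tower parameter and log it, mirroring the
    '=> set param ... requires grad to False.' lines in this log."""
    for name, param in model.named_parameters():
        if name.startswith(prefix):
            param.requires_grad = False
            print(f"=> set param {name} {param.shape} requires grad to False.")
```
Because all ranks write these messages to the same stdout, output from different tensor-parallel ranks interleaves, which is why consecutive lines above jump between layer indices (e.g. layers 10, 11, 21, 22).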
+=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
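The frozen external_feature_model.vit.position_embeddings.weight has shape [1025, 1024]: one class-token position plus a 32x32 grid of patch positions, consistent with the 14x14 conv1 patch size and a 448x448 input. A quick check (the 448px input resolution is an assumption; this log does not state it explicitly):
```python
image_size, patch_size = 448, 14             # assumed input resolution; patch size from conv1
patches_per_side = image_size // patch_size  # 32
num_positions = patches_per_side ** 2 + 1    # 1024 patch tokens + 1 class token
assert num_positions == 1025                 # matches Embedding(1025, 1024) in the printout
```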
+=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
+ +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 1887497216=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +)=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + > number of parameters on (tensor, pipeline) model parallel rank (4, 0): 1887497216 +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +)=> set param 
external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 1887497216 +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
+=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. 
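The vision_model_freeze messages above show every InternViT parameter being excluded from training before stage-3 finetuning starts. A minimal sketch of how such a freeze is typically implemented in PyTorch (the helper name, module prefix and print format below are assumptions for illustration, not the project's verified code):

    import torch.nn as nn

    def freeze_vision_tower(model: nn.Module, prefix: str = "external_feature_model.vit") -> None:
        # Walk every parameter; anything under the vision tower stops receiving gradients.
        for name, param in model.named_parameters():
            if name.startswith(prefix):
                param.requires_grad = False
                # Mirrors the "=> set param ... requires grad to False." lines in this log.
                print(f"=> set param {name} {param.shape} requires grad to False.")

Frozen parameters still take part in the forward pass and are simply skipped by the optimizer, which is why they continue to appear in the per-rank parameter counts reported in this log.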
+ > number of parameters on (tensor, pipeline) model parallel rank (7, 0): 1887497216 + > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 1887497216 +model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +) + > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 1887497216 + > number of parameters on (tensor, pipeline) model parallel rank (5, 0): 1887497216 +vision_model_freeze +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + > number of parameters on (tensor, pipeline) model parallel rank (5, 0): 1887497216 +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +)=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
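Note: the per-rank shapes in the frozen-parameter lines and in the GPTVLModel printout above follow from tensor-parallel sharding of the InternViT encoder (the intern_300m model in the script name) across TP=8. A minimal sketch of the arithmetic; the hidden and FFN widths below are inferred from the printed torch.Size entries, not read from the training config:

# Sketch: how the per-rank ViT parameter shapes in this log arise under tensor parallelism (TP=8).
# Assumed values are inferred from the printed shapes above, not taken from the config file.
tp = 8                       # tensor-parallel world size ("tp8" in the script name)
hidden = 1024                # InternViT hidden size (input_layernorm has 1024 elements)
ffn_hidden = 4096            # full MLP width; 4096 / 8 = 512 matches linear_fc1 [512, 1024]
qkv_out = 3 * hidden         # fused Q, K, V projection

# ColumnParallelLinear shards the output dimension across ranks:
assert qkv_out // tp == 384      # linear_qkv.weight  -> torch.Size([384, 1024])
assert ffn_hidden // tp == 512   # mlp.linear_fc1.weight -> torch.Size([512, 1024])

# RowParallelLinear shards the input dimension across ranks:
assert hidden // tp == 128       # linear_proj.weight -> torch.Size([1024, 128])
assert ffn_hidden // tp == 512   # mlp.linear_fc2.weight -> torch.Size([1024, 512])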
+=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +) + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. + > number of parameters on (tensor, pipeline) model parallel rank (6, 0): 1887497216 +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + > number of parameters on (tensor, pipeline) model parallel rank (7, 0): 1887497216 +model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +) + > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 1887497216 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1887497216 + > number of parameters on (tensor, pipeline) model parallel rank (4, 0): 1887497216 +vision_model_freeze +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. 
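Note: the "vision_model_freeze" marker and the "=> set param ... requires grad to False." lines around it come from freezing the ViT before stage-3 finetuning. A minimal sketch of what such a freeze routine presumably does; the function name and print format are taken from this log, while the actual implementation in cognitron_vl may differ:

import torch.nn as nn

def vision_model_freeze(model: nn.Module, prefix: str = "external_feature_model.vit") -> None:
    """Freeze every parameter under the ViT submodule and log it, mirroring the lines above."""
    for name, param in model.named_parameters():
        if name.startswith(prefix):
            param.requires_grad = False
            # param.shape prints as torch.Size([...]), matching the log format
            print(f"=> set param {name} {param.shape} requires grad to False.")

Only the vision tower is frozen here; the vision_projection, pre_proj_layernorm, and the language decoder are left trainable, which is why no "set param" lines appear for them.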
+=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
+=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +) + > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 1887497216 +INFO:megatron.core.distributed.param_and_grad_buffer:Number of buckets for gradient all-reduce / reduce-scatter: 39 +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 1 (97320960 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.output_layer.weight +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 2 (43270016 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.final_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.mlp.linear_fc2.weight 
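The "=> set param ... requires grad to False." messages above show the InternViT parameters being frozen before training, and the GPTVLModel printout summarizes the resulting stack: a frozen InternViT encoder, an MLP vision projector with a pre-projection LayerNorm, and a 48-layer Qwen2.5 decoder. The reported 1,887,497,216 parameters are for tensor-parallel rank 2 only; summed over the eight tensor-parallel ranks implied by the tp8 setting this comes to roughly 15.1 B parameters, in line with a 14 B decoder plus a ~300 M ViT, ignoring modules that are replicated on every rank. A minimal sketch of how such freezing is typically done (the freeze_vision_encoder helper and the model handle are illustrative, not taken from the actual training code):

    import torch

    def freeze_vision_encoder(model: torch.nn.Module,
                              prefix: str = "external_feature_model.vit") -> None:
        # Walk the parameter tree and disable gradients for every parameter under
        # the vision encoder, printing a line in the same format as the log above.
        for name, param in model.named_parameters():
            if name.startswith(prefix):
                param.requires_grad = False
                print(f"=> set param {name} {param.shape} requires grad to False.")

Frozen parameters produce no gradients, which is why the gradient all-reduce buckets listed below contain only the language-model decoder, the output layer, and the trainable projector, not the ViT weights.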
+INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 3 (52112256 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 4 (42287872 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 5 (43270016 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.43.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.43.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.43.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.43.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.43.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: 
module.decoder.layers.43.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.43.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 6 (52112256 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 7 (42287872 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 8 (43270016 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 9 (52112256 elements): 
+INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 10 (42287872 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 11 (43270016 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 12 (52112256 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.self_attention.linear_proj.weight 
+INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 13 (42287872 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 14 (43270016 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 15 (52112256 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.input_layernorm.weight 
+INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 16 (42287872 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 17 (43270016 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 18 (52112256 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.self_attention.linear_qkv.bias 
+INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.25.mlp.linear_fc2.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.25.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 19 (42287872 elements): +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.25.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.25.self_attention.linear_qkv.bias +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.24.mlp.linear_fc2.weight +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.24.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.24.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.24.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.25.self_attention.linear_qkv.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.25.self_attention.linear_proj.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.25.input_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.24.pre_mlp_layernorm.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.24.self_attention.linear_qkv.bias +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 20 (43270016 elements): + +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.24.input_layernorm.weight +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.23.pre_mlp_layernorm.weight +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.23.self_attention.linear_qkv.bias + +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.22.mlp.linear_fc2.weight +_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.23.mlp.linear_fc2.weight +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.23.mlp.linear_fc1.weight +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.23.self_attention.linear_qkv.weight +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.23.self_attention.linear_proj.weight +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: 
module.decoder.layers.23.input_layernorm.weight + +_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 21 (52112256 elements): +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.22.mlp.linear_fc1.weight +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.22.self_attention.linear_qkv.weight +_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.22.self_attention.linear_proj.weight +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.22.input_layernorm.weight +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.22.self_attention.linear_qkv.bias +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.22.pre_mlp_layernorm.weight +_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.21.mlp.linear_fc2.weight +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.21.mlp.linear_fc1.weight_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 22 (42287872 elements): +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.21.pre_mlp_layernorm.weight +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.21.self_attention.linear_qkv.bias +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.20.mlp.linear_fc2.weight +_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.20.mlp.linear_fc1.weight +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.20.self_attention.linear_qkv.weight +_get_param_groups 
name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.20.self_attention.linear_proj.weight_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) + +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.21.input_layernorm.weight_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.21.self_attention.linear_proj.weight +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.21.self_attention.linear_qkv.weight + + +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.20.pre_mlp_layernorm.weight +_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.20.self_attention.linear_qkv.bias +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 23 (43270016 elements): +_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.20.input_layernorm.weight +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.19.pre_mlp_layernorm.weight +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.19.self_attention.linear_qkv.bias_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.18.mlp.linear_fc2.weight +_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) 
+INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.19.mlp.linear_fc1.weight +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.19.mlp.linear_fc2.weight_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.19.self_attention.linear_qkv.weight_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.19.self_attention.linear_proj.weight + + +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.19.input_layernorm.weight +_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 24 (52112256 elements):_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.18.mlp.linear_fc1.weight +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) + +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.18.self_attention.linear_qkv.weight +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.18.self_attention.linear_proj.weight_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.18.input_layernorm.weight + + +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name 
module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.18.self_attention.linear_qkv.bias +_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.18.pre_mlp_layernorm.weight +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.17.mlp.linear_fc2.weight +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.17.mlp.linear_fc1.weight +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 25 (42287872 elements): + + +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.17.self_attention.linear_qkv.bias +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.16.mlp.linear_fc2.weight +_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) + +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.16.mlp.linear_fc1.weight +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.16.self_attention.linear_qkv.weight +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.16.self_attention.linear_proj.weight +_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias 
key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.17.pre_mlp_layernorm.weight +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.17.self_attention.linear_qkv.weight +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.17.input_layernorm.weight +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.17.self_attention.linear_proj.weight_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.16.pre_mlp_layernorm.weight +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.16.self_attention.linear_qkv.bias +_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 
1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 26 (43270016 elements): +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.16.input_layernorm.weight + +_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.15.pre_mlp_layernorm.weight +_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.15.self_attention.linear_qkv.bias +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.14.mlp.linear_fc2.weight_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.15.mlp.linear_fc1.weight +_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.15.mlp.linear_fc2.weight +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.15.self_attention.linear_qkv.weight + +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) 
+INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.15.self_attention.linear_proj.weight +_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.15.input_layernorm.weight +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 27 (52112256 elements): +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.14.mlp.linear_fc1.weight +_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.14.self_attention.linear_proj.weight +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.14.self_attention.linear_qkv.weight +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.14.input_layernorm.weight + +_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.14.self_attention.linear_qkv.bias +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.14.pre_mlp_layernorm.weight +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.13.mlp.linear_fc2.weight +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) 
+_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.{0..29}.input_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.{0..29}.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.{0..29}.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.{0..29}.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.{0..29}.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.{0..29}.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.{0..29}.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
+INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 28 (42287872 elements): module.decoder.layers.12-13.*
+INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 29 (43270016 elements): module.decoder.layers.10-12.*
+INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 30 (52112256 elements): module.decoder.layers.9-10.*
+INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 31 (42287872 elements): module.decoder.layers.8-9.*
+INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 32 (43270016 elements): module.decoder.layers.6-8.*
+INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 33 (52112256 elements): module.decoder.layers.5-6.*
+INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 34 (42287872 elements): module.decoder.layers.4-5.*
+INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 35 (43270016 elements): module.decoder.layers.2-4.*
+INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 36 (52112256 elements): module.decoder.layers.1-2.*
+INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 37 (42287872 elements): module.decoder.layers.0-1.*
False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.1.pre_mlp_layernorm.weight_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) + +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.1.self_attention.linear_qkv.bias +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.0.mlp.linear_fc2.weight_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) +_get_param_groups name 
module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.0.mlp.linear_fc1.weight +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.0.self_attention.linear_qkv.weight_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) + + + + + +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.0.self_attention.linear_proj.weight + +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups 
name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 38 (97326080 elements): +_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) + + +INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.0.input_layernorm.weight +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.embedding.word_embeddings.weight + +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 39 (1187840 elements): + +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups 
name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.external_feature_model.pre_proj_layernorm.bias_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.external_feature_model.pre_proj_layernorm.weight +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:megatron.core.distributed.param_and_grad_buffer: module.external_feature_model.vision_projection.encoder.linear_fc1.weight +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) 
+_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.external_feature_model.vision_projection.encoder.linear_fc2.weight + +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups 
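[Note: the bucket messages above come from megatron.core's distributed data-parallel param-and-grad buffer, which packs parameters into contiguous buckets so their gradients can be reduced together. The snippet below is only a minimal illustrative sketch of that packing idea under assumed names; pack_into_buckets and bucket_size are invented for illustration and are not the megatron.core implementation.]

    # Illustrative sketch only: pack parameters into buckets by element count,
    # mimicking the "Params for bucket N (X elements):" messages above.
    # bucket_size is an assumed knob, not Megatron's actual default.
    from typing import Iterable, List, Tuple

    import torch


    def pack_into_buckets(params: Iterable[torch.nn.Parameter],
                          bucket_size: int = 40_000_000) -> List[Tuple[List[torch.nn.Parameter], int]]:
        buckets, current, current_numel = [], [], 0
        for p in params:
            current.append(p)
            current_numel += p.numel()
            # Close the bucket once it reaches the target element count.
            if current_numel >= bucket_size:
                buckets.append((current, current_numel))
                current, current_numel = [], 0
        if current:  # flush the remaining tail bucket
            buckets.append((current, current_numel))
        return buckets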
[Each rank also prints one _get_param_groups line per model parameter; with <n> ranging over the decoder layer indices, the distinct entries in this span are:]
+_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<n>.input_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<n>.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<n>.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<n>.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<n>.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<n>.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<n>.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name 
module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key 
(1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name 
module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, 
False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name 
module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + 
+_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight 
key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups 
name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, 
False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, 
False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, 
False, False) +_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name 
+_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
+[... the same seven _get_param_groups lines repeat for decoder layers 15 through 47; in the original log this output is interleaved across ranks and duplicated, and is deduplicated here ...]
+_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)
module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + + + + +_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 
1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, 
False, False) + +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight 
key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, 
False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name 
module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name 
module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, 
False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, 
False) + +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name 
module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) 
+_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight 
key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + 
+_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name 
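The key tuples above appear to be per-parameter multipliers used when building optimizer parameter groups: the first element looks like a weight-decay multiplier (0.0 for biases and layernorm weights, 1.0 for all other weights) and the second like a learning-rate multiplier (1.0 throughout this run); the two trailing False flags are left uninterpreted here. A minimal Python sketch of that grouping logic, using hypothetical names and not the actual ModelLink/Megatron _get_param_groups implementation, could look like:

from collections import defaultdict

def build_param_groups(named_params, weight_decay=0.01, lr=1e-5):
    # Group parameters by a (wd_mult, lr_mult) key, mirroring the tuples logged above:
    # biases and layernorm weights are excluded from weight decay (wd_mult = 0.0).
    groups = defaultdict(list)
    for name, param in named_params:
        no_wd = name.endswith(".bias") or "layernorm" in name.lower()
        key = (0.0 if no_wd else 1.0, 1.0)  # (wd_mult, lr_mult)
        groups[key].append(param)
    # One optimizer group per distinct key, with the multipliers applied.
    return [
        {"params": ps, "weight_decay": weight_decay * wd, "lr": lr * lr_mult}
        for (wd, lr_mult), ps in groups.items()
    ]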
+_get_param_groups name module.module.decoder.layers.{0..16}.input_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.{0..16}.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.{0..16}.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.{0..16}.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.{0..16}.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.{0..16}.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.{0..16}.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False,
False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
[... the _get_param_groups listing continues in the same pattern for decoder layers 18 through at least 40: input_layernorm.weight, pre_mlp_layernorm.weight and self_attention.linear_qkv.bias get key (0.0, 1.0, False, False), while self_attention.linear_proj.weight, self_attention.linear_qkv.weight, mlp.linear_fc1.weight and mlp.linear_fc2.weight get key (1.0, 1.0, False, False). It also reports module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False), module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight and linear_fc2.weight key (1.0, 1.0, False, False), and module.module.external_feature_model.pre_proj_layernorm.weight and .bias key (0.0, 1.0, False, False). The remaining lines of this block are the same listing printed concurrently by the other worker processes on the node and interleaved in the log. ...]
+_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key 
(0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name 
module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, 
False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, 
False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, 
False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name 
module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, 
False, False) + +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name 
module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, 
False)_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
+[_get_param_groups output from multiple worker processes was interleaved in this stretch of the log; the duplicated per-rank entries are collapsed below into one line per unique parameter pattern, where <N> stands for the decoder layer indices printed here (0-4 and 12-47). The raw interleaved output resumes afterwards.]
+_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<N>.input_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<N>.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<N>.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<N>.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<N>.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<N>.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.<N>.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
(0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +INFO:megatron.core.optimizer:Setting up optimizer with OptimizerConfig(optimizer='adam', lr=5e-06, min_lr=1e-07, decoupled_lr=None, decoupled_min_lr=None, weight_decay=0.0, fp16=False, bf16=True, params_dtype=torch.bfloat16, loss_scale=None, initial_loss_scale=4096.0, min_loss_scale=1.0, loss_scale_window=1000, hysteresis=2, adam_beta1=0.9, adam_beta2=0.999, adam_eps=1e-08, sgd_momentum=0.9, use_distributed_optimizer=True, overlap_grad_reduce=True, overlap_param_gather=False, clip_grad=1.0, log_num_zeros_in_grad=False, barrier_with_L1_time=True, timers=)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name 
module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) 
+_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name 
module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, 
False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name 
module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) 
+_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name 
module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key 
(0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight 
key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name 
module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) 
+_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) 
+_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
+[... analogous _get_param_groups lines repeat for every parameter of decoder.layers.40 through decoder.layers.47: input_layernorm.weight, self_attention.linear_qkv.bias and pre_mlp_layernorm.weight get key (0.0, 1.0, False, False); self_attention.linear_proj.weight, self_attention.linear_qkv.weight, mlp.linear_fc1.weight and mlp.linear_fc2.weight get key (1.0, 1.0, False, False) ...]
+_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)
+[... interleaved with the lines above, another rank logs the same per-parameter keys for decoder.layers.8 through decoder.layers.39; each of its lines is printed once to stdout and echoed once more with the INFO:lcvlm_modellink.core.optimizer prefix ...]
+_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
+_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
+_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
+[... the per-parameter lines continue in the same pattern for decoder.layers.1 onward, still interleaved across ranks ...]
module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) + +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) + +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name 
module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, 
False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 
1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name 
module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, 
False) +_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name 
module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name 
module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) 
+_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name 
module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) 
+_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key 
(1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name 
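The key tuples above read like per-parameter multipliers of the form (weight-decay multiplier, learning-rate multiplier, plus two boolean flags), with layernorm weights and attention biases getting a 0.0 weight-decay multiplier and all other weights 1.0. Below is a minimal sketch of how such a grouping could be built; it is illustrative only, the tuple interpretation is an assumption, and this is not the actual lcvlm_modellink/Megatron implementation.

from collections import defaultdict
import torch

def get_param_groups(model: torch.nn.Module, weight_decay: float, lr: float):
    # Bucket parameters by a key tuple like the one printed in the log:
    # (wd_mult, lr_mult, flag1, flag2). Biases and 1-D tensors (layernorms)
    # are exempted from weight decay (wd_mult = 0.0).
    buckets = defaultdict(list)
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        no_wd = name.endswith(".bias") or param.dim() == 1
        key = (0.0 if no_wd else 1.0, 1.0, False, False)
        buckets[key].append(param)
    return [
        {"params": params, "weight_decay": weight_decay * key[0], "lr": lr * key[1]}
        for key, params in buckets.items()
    ]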
+> learning rate decay style: cosine
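The scheduler reported above uses a cosine learning-rate decay: after an optional warmup, the rate is annealed along a half-cosine from its peak value down to a floor. A minimal sketch of that schedule follows; the parameter names and the warmup handling are assumptions for illustration, not values read from this run.

import math

def cosine_lr(step: int, max_lr: float, min_lr: float,
              warmup_iters: int, decay_iters: int) -> float:
    if step < warmup_iters:                      # linear warmup
        return max_lr * step / max(1, warmup_iters)
    if step >= decay_iters:                      # hold at the floor afterwards
        return min_lr
    progress = (step - warmup_iters) / (decay_iters - warmup_iters)
    coeff = 0.5 * (1.0 + math.cos(math.pi * progress))
    return min_lr + coeff * (max_lr - min_lr)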
+_load_base_checkpoint iteration 5000
+_load_base_checkpoint release False
+[... the two lines above are repeated by each of the 16 ranks ...]
+_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_00/model_optim_rng.pt
+_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_01/model_optim_rng.pt
+_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_02/model_optim_rng.pt
+_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_03/model_optim_rng.pt
+_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_04/model_optim_rng.pt
+_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_05/model_optim_rng.pt
+_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_06/model_optim_rng.pt
+_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_07/model_optim_rng.pt
+ loading checkpoint from /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/ at iteration 5000
+load_checkpoint iteration 0
+load_checkpoint release False
+strict True
+[... the three lines above are repeated by each of the 16 ranks ...]
+could not find arguments in the checkpoint ...
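The paths above follow the Megatron-style checkpoint layout, <load_dir>/iter_<7-digit iteration>/mp_rank_<2-digit tensor-parallel rank>/model_optim_rng.pt (pipeline-parallel runs also append the PP rank; pp=1 here, so it is omitted). A small sketch of assembling such a path; the helper name is hypothetical and this is not code from this repository.

import os

def checkpoint_path(load_dir: str, iteration: int, tp_rank: int) -> str:
    # Mirrors the directory layout seen in the log lines above.
    return os.path.join(load_dir,
                        f"iter_{iteration:07d}",
                        f"mp_rank_{tp_rank:02d}",
                        "model_optim_rng.pt")

# e.g. checkpoint_path(CKPT_LOAD_DIR, 5000, 3)
#   -> .../iter_0005000/mp_rank_03/model_optim_rng.pt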
+ checkpoint version 3.0
+ successfully loaded checkpoint from /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/ [ t 0, p 0 ] at iteration 0
+[after model, optimizer, and learning rate scheduler are built] datetime: 2024-11-27 12:45:29 
+> building train, validation, and test datasets ...
+ > datasets target sizes (minimum size):
+    train: 64000
+    validation: 0
+    test: 0
+INFO:megatron.core.datasets.blended_megatron_dataset_config:mock = False
+INFO:megatron.core.datasets.blended_megatron_dataset_config:Let split_matrix = [(0, 1.0), None, None]
+> building train, validation, and test datasets for GPT ...
+> rank 12 does not create GPT datasets ...
+> rank 9 does not create GPT datasets ...
+> rank 5 does not create GPT datasets ...
+> rank 6 does not create GPT datasets ...
+> rank 14 does not create GPT datasets ...
+> rank 2 does not create GPT datasets ...
+> rank 3 does not create GPT datasets ...
+> rank 15 does not create GPT datasets ...
+> rank 4 does not create GPT datasets ...
+> rank 10 does not create GPT datasets ...
+> rank 11 does not create GPT datasets ...
+> rank 7 does not create GPT datasets ...
+> rank 13 does not create GPT datasets ...
+> rank 8 is creating GPT datasets ...
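The split_matrix value [(0, 1.0), None, None] says the blended dataset config maps the entire [0, 1] document range to the train split and builds no validation or test split, which matches the target sizes above (train 64000, validation 0, test 0). A rough sketch of how a "train,valid,test" split such as "100,0,0" could be turned into that matrix; this is assumed behavior for illustration, not the megatron.core implementation.

from typing import List, Optional, Tuple

def split_vector_to_matrix(split: List[float]) -> List[Optional[Tuple[float, float]]]:
    # Normalize the weights and lay the splits end to end on [0, 1];
    # a zero-weight split becomes None (no dataset is built for it).
    total = sum(split)
    bounds, acc = [], 0.0
    for frac in (s / total for s in split):
        bounds.append(None if frac == 0.0 else (acc, acc + frac))
        acc += frac
    return bounds

print(split_vector_to_matrix([100, 0, 0]))  # -> [(0.0, 1.0), None, None]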
+tokenizer Qwen2TokenizerFast(name_or_path='/data_4/models/Qwen/Qwen2.5-14B-Instruct/', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False), added_tokens_decoder={
+ 151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
+ 151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
+ 151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
+ 151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
+ 151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
+ 151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
+ 151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
+ 151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
+ 151665: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151666: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151667: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151668: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151669: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151670: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151671: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151672: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151673: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151674: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151675: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151676: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151677: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151678: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151679: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151680: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151681: AddedToken("
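
The dump above is the repr of the Hugging Face tokenizer that the data pipeline builds: the stock Qwen2.5-14B-Instruct fast tokenizer (base vocab 151643, 128K model_max_length) with its chat, vision and FIM added tokens, followed by a block of project-specific special tokens from id 151665 upward whose literal strings are not preserved in the capture above. The sketch below is not part of the run; it shows how the same information can be reproduced with standard transformers APIs, assuming the tokenizer files at that path are unchanged.

# Offline sketch (not part of the log above): load the same tokenizer and print the
# fields shown in the dump. Only standard transformers tokenizer APIs are used.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("/data_4/models/Qwen/Qwen2.5-14B-Instruct/")

print(type(tok).__name__)          # Qwen2TokenizerFast
print(tok.vocab_size)              # 151643 (base vocab, excludes added tokens)
print(tok.model_max_length)        # 131072
print(tok.special_tokens_map)      # eos '<|im_end|>', pad '<|endoftext|>', additional specials
print(len(tok))                    # base vocab plus added tokens

# added_tokens_decoder maps token id -> AddedToken, as in the dump above
for idx, added in sorted(tok.added_tokens_decoder.items()):
    print(idx, repr(added.content), "special" if added.special else "non-special")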