# Adopted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
# Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import os
import copy
import json
import random
import pathlib
import traceback
from dataclasses import dataclass, field
from typing import Dict, Optional, Sequence, List

# torch-related packages
# NOTE: torch must be imported before transformers. Otherwise, `Segmentation fault (core dumped)` will occur.
import torch
from torch.utils.data import Dataset

import transformers
from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock

import sys
sys.path.append('./')
from videollama2.model import *
from videollama2.constants import NUM_FRAMES, IGNORE_INDEX, MODAL_INDEX_MAP
from videollama2.mm_utils import tokenizer_multimodal_token, process_video, process_image, process_audio_file
from videollama2.videollama2_trainer import (VideoLLaMA2Trainer,
                                             get_peft_state_maybe_zero_3, get_peft_state_non_lora_maybe_zero_3,
                                             find_all_linear_names, safe_save_model_for_hf_trainer)

# NOTE: fast tokenizer warning issue: https://github.com/huggingface/transformers/issues/5486
os.environ["TOKENIZERS_PARALLELISM"] = "true"

local_rank = None


def rank0_print(*args):
    if local_rank == 0:
        print(*args)


def set_seed(seed=42):
    """
    Set the random seed for reproducible results.

    :param seed: An integer value to be used as the random seed.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # for multi-GPU setups
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
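

# The three dataclasses below define the command-line surface of this script.
# They are parsed jointly by transformers.HfArgumentParser inside train(), so
# every field name doubles as a `--flag` on the command line.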
@dataclass
class ModelArguments:
    # LLM Arguments
    model_type: Optional[str] = field(default="videollama2", metadata={"help": "Model type selected in the list: " + ", ".join(VLLMs.keys())})
    model_path: Optional[str] = field(default="lmsys/vicuna-7b-v1.5")
    version: Optional[str] = field(default="v1", metadata={"help": "Version of the conversation template."})
    freeze_backbone: bool = field(default=False, metadata={"help": "Whether to freeze the LLM backbone."})
    tune_adapter_llm: bool = field(default=False)
    # Connector Arguments
    mm_projector_type: Optional[str] = field(default='linear')
    mm_projector_a_type: Optional[str] = field(default='linear')
    tune_mm_mlp_adapter: bool = field(default=False)
    tune_mm_mlp_adapter_a: bool = field(default=False)
    pretrain_mm_mlp_adapter: Optional[str] = field(default=None)
    pretrain_mm_mlp_adapter_a: Optional[str] = field(default=None)
    # Vision tower Arguments
    vision_tower: Optional[str] = field(default=None)
    mm_vision_select_layer: Optional[int] = field(default=-1)
    mm_vision_select_feature: Optional[str] = field(default="patch")
    # Audio tower Arguments
    audio_tower: Optional[str] = field(default=None)
    tune_audio_tower: bool = field(default=False)


@dataclass
class DataArguments:
    # Path Arguments
    data_path: str = field(default=None, metadata={"help": "Path to the training data."})
    data_path_a: Optional[str] = field(default=None, metadata={"help": "Path to the audio data."})
    # image_folder: Optional[str] = field(default=None)
    # video_folder: Optional[str] = field(default=None)
    data_folder: Optional[str] = field(default=None)
    # Loading Arguments
    is_multimodal: bool = False
    va: bool = field(default=False)
    lazy_preprocess: bool = False
    num_frames: Optional[int] = field(default=None)
    # Preprocess Arguments
    image_aspect_ratio: str = 'square'


@dataclass
class TrainingArguments(transformers.TrainingArguments):
    optim: str = field(default="adamw_torch")
    mm_projector_lr: Optional[float] = None
    freeze_mm_mlp_adapter: bool = field(default=False)
    remove_unused_columns: bool = field(default=False)
    cache_dir: Optional[str] = field(default=None)
    # Training Data Arguments
    group_by_modality_length: bool = field(default=False)
    model_max_length: int = field(
        default=512,
        metadata={
            "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    # Lora or Quant Arguments
    double_quant: bool = field(
        default=True,
        metadata={"help": "Compress the quantization statistics through double quantization."}
    )
    quant_type: str = field(
        default="nf4",
        metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
    )
    bits: int = field(
        default=16,
        metadata={"help": "How many bits to use."}
    )
    lora_enable: bool = False
    lora_r: int = 64
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_weight_path: str = ""
    lora_bias: str = "none"
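

# preprocess_plain() handles single-turn "pretraining" samples: the user turn is
# reduced to the bare modal token (e.g. <audio>) and the labels of the entire
# instruction prefix are set to IGNORE_INDEX, so the loss is computed only on
# the assistant response.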
def preprocess_plain(
    sources: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
    modal_token: str = None,
) -> Dict:
    roles = {"human": "user", "gpt": "assistant"}

    conversations = []
    input_ids = []
    targets = []
    for source in sources:
        # 1. apply chat template for input conversation
        assert len(source) == 2
        assert modal_token in source[0]['value']
        message = [
            {'role': 'user', 'content': modal_token},
            {'role': 'assistant', 'content': source[1]['value']}
        ]
        conversation = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=False)
        # e.g., <s> [INST] <audio> [/INST] Someone is speaking.</s>

        # 2. tokenize conversations
        input_ids.append(tokenizer_multimodal_token(conversation, tokenizer, modal_token, return_tensors='pt'))

        # 3. make targets: mask the instruction prefix so that only the response is supervised
        targets.append(copy.deepcopy(input_ids[-1]))
        instruction = tokenizer.apply_chat_template(message[:1], tokenize=False, add_generation_prompt=True)
        # e.g., <s> [INST] <audio> [/INST]
        instruction_len = len(tokenizer_multimodal_token(instruction, tokenizer, modal_token, return_tensors='pt'))
        targets[-1][:instruction_len] = IGNORE_INDEX

    return dict(input_ids=input_ids, labels=targets)
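

# preprocess() handles multi-turn SFT conversations: the chat template is applied
# to the full dialogue, then each per-turn instruction span is masked with
# IGNORE_INDEX so that only the assistant replies contribute to the loss.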
def preprocess(
    sources: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
    modal_token: str = None,
) -> Dict:
    roles = {"human": "user", "gpt": "assistant"}

    # Apply prompt templates
    conversations = []
    input_ids = []
    targets = []
    for i, source in enumerate(sources):
        if roles[source[0]["from"]] != "user":
            # Skip the first one if it is not from human
            source = source[1:]
        message = [{'role': roles[sentence['from']], 'content': sentence['value']} for sentence in source]
        conversation = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=False)
        input_ids.append(tokenizer_multimodal_token(conversation, tokenizer, modal_token, return_tensors='pt'))
        targets.append(copy.deepcopy(input_ids[-1]))

        assert len(source) % 2 == 0, f"Invalid conversation length {len(source)}."

        cur = 0
        message = []
        for idx, sentence in enumerate(source):
            if idx % 2 == 1:
                tmp_message = [
                    {'role': roles[source[idx-1]['from']], 'content': source[idx-1]['value']},
                    {'role': roles[sentence['from']], 'content': sentence['value']}
                ]

                instruction = tokenizer.apply_chat_template(message + tmp_message[:1], tokenize=False, add_generation_prompt=True)
                conversation = tokenizer.apply_chat_template(message + tmp_message, tokenize=False, add_generation_prompt=False)

                instruction_len = len(tokenizer_multimodal_token(instruction, tokenizer, modal_token, return_tensors='pt'))
                conversation_len = len(tokenizer_multimodal_token(conversation, tokenizer, modal_token, return_tensors='pt'))

                # mask everything from the end of the previous turn up to the end of the current instruction
                targets[-1][cur:instruction_len] = IGNORE_INDEX

                cur = conversation_len
                message += tmp_message

    return dict(input_ids=input_ids, labels=targets)
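

# preprocess_multimodal() normalizes where the modal token appears: if a turn
# contains e.g. <video>, the token is stripped from its original position and
# re-inserted at the head of that turn, followed by a newline.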
def preprocess_multimodal(
    sources: Sequence[str],
    data_args: DataArguments,
    modal_token: str = None,
) -> Dict:
    is_multimodal = data_args.is_multimodal
    if not is_multimodal:
        return sources

    assert modal_token in MODAL_INDEX_MAP, f"Unsupported modal token {modal_token}."

    for source in sources:
        for sentence in source:
            if modal_token in sentence['value']:
                sentence['value'] = sentence['value'].replace(modal_token, '').strip()
                sentence['value'] = modal_token + '\n' + sentence['value']
                sentence['value'] = sentence['value'].strip()
            replace_token = modal_token
            # TODO: fix this for multimedia, e.g., <video>, <audio>, etc.
            sentence["value"] = sentence["value"].replace(modal_token, replace_token)

    return sources
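

# LazySupervisedDataset keeps only the raw JSON annotations in memory and performs
# image/video/audio decoding plus tokenization on demand in __getitem__. When
# data_path is a comma-separated list, stage2 audio, stage2 video, and stage3
# audio-visual annotation files are shuffled and concatenated (mix_sampler_tag).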
class LazySupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning."""

    def __init__(self, data_path: str, data_path_a: str,
                 tokenizer: transformers.PreTrainedTokenizer,
                 data_args: DataArguments):
        super(LazySupervisedDataset, self).__init__()
        self.mix_sampler_tag = False
        if data_path is not None and len(data_path.split(",")) == 1:
            data_path = data_path.split(",")[0]
            list_data_dict = json.load(open(data_path, "r"))
        elif data_path is not None and len(data_path.split(",")) > 1:
            self.mix_sampler_tag = True
            data_path = data_path.split(",")
            for path in data_path:
                if "stage3" in path:
                    self.av_data = json.load(open(path, "r"))
                    random.shuffle(self.av_data)
                elif "stage2" in path and "audio" in path:
                    self.a_data = json.load(open(path, "r"))
                    random.shuffle(self.a_data)
                elif "stage2" in path and "video" in path:
                    self.v_data = json.load(open(path, "r"))
                    random.shuffle(self.v_data)
                else:
                    raise NotImplementedError
            list_data_dict = self.av_data + self.a_data + self.v_data
        if data_path_a is not None:
            list_data_dict = json.load(open(data_path_a, "r"))

        rank0_print("Formatting inputs...Skip in lazy mode")
        self.tokenizer = tokenizer
        self.list_data_dict = list_data_dict
        self.data_args = data_args

    def __len__(self):
        return len(self.list_data_dict)

    @property
    def lengths(self):
        length_list = []
        for sample in self.list_data_dict:
            img_tokens = 576 if 'image' in sample else 0
            length_list.append(sum(len(conv['value'].split()) for conv in sample['conversations']) + img_tokens)
        return length_list

    @property
    def modality_lengths(self):
        length_list = []
        for sample in self.list_data_dict:
            cur_len = sum(len(conv['value'].split()) for conv in sample['conversations'])
            cur_len = cur_len if 'image' in sample else -cur_len
            length_list.append(cur_len)
        return length_list

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        sources = self.list_data_dict[i]
        if isinstance(i, int):
            sources = [sources]
        assert len(sources) == 1, "Don't know why it is wrapped to a list"  # FIXME

        if self.data_args.data_path is not None:
            image_processor = self.data_args.image_processor
            video_processor = self.data_args.video_processor

        num_frames = NUM_FRAMES if self.data_args.num_frames is None else self.data_args.num_frames

        if 'image' in sources[0]:
            image_file = self.list_data_dict[i]['image']
            image_folder = self.data_args.data_folder
            image_file = os.path.join(image_folder, image_file)

            try:
                image = process_image(image_file, image_processor, aspect_ratio=self.data_args.image_aspect_ratio)
            except Exception:
                traceback.print_exc()
                backup_idx = random.randint(0, len(self.list_data_dict) - 1)
                print(f"Encountered error when reading image {image_file}, use {backup_idx}-th example instead!!!")
                return self.__getitem__(backup_idx)

            # place the <image> tag at the head of the question.
            modal_token = "<image>"
            sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args, modal_token)
        elif 'video' in sources[0]:
            video_file = self.list_data_dict[i]['video']
            video_folder = self.data_args.data_folder
            if video_folder:
                video_file = os.path.join(video_folder, video_file)

            try:
                video = process_video(video_file, video_processor, aspect_ratio=self.data_args.image_aspect_ratio, num_frames=num_frames, va=self.data_args.va if not self.mix_sampler_tag else (i < len(self.av_data)))
            except Exception:
                traceback.print_exc()
                backup_idx = random.randint(0, len(self.list_data_dict) - 1)
                print(f"Encountered error when reading video {video_file}, use {backup_idx}-th example instead!!!")
                return self.__getitem__(backup_idx)

            # place the <video> tag at the head of the question.
            modal_token = "<video>"
            sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args, modal_token)
        elif 'audio' in sources[0]:
            audio_file = self.list_data_dict[i]['audio']
            # audio_folder = self.data_args.base_folder

            try:
                audio = process_audio_file(audio_file)
            except Exception as e:
                print(e)
                backup_idx = random.randint(0, len(self.list_data_dict) - 1)
                print(f"Encountered error when reading audio {audio_file}, use {backup_idx}-th example instead!!!")
                return self.__getitem__(backup_idx)

            # place the <audio> tag at the head of the question.
            modal_token = "<audio>"
            sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args, modal_token)
        else:
            modal_token = None
            sources = copy.deepcopy([e["conversations"] for e in sources])

        if self.data_args.is_pretraining:
            data_dict = preprocess_plain(sources, self.tokenizer, modal_token=modal_token)
        else:
            data_dict = preprocess(sources, self.tokenizer, modal_token=modal_token)

        if isinstance(i, int):
            data_dict = dict(input_ids=data_dict["input_ids"][0], labels=data_dict["labels"][0])

        # attach the raw modality tensor to the example
        if 'image' in self.list_data_dict[i]:
            data_dict['image'] = image
        elif 'video' in self.list_data_dict[i]:
            data_dict['video'] = video
        elif 'audio' in self.list_data_dict[i]:
            data_dict['audio'] = audio
        elif self.data_args.data_path_a:
            # audio does not exist in the data, but the model is multimodal: pad with a dummy fbank tensor
            data_dict['audio'] = torch.zeros(1, 2998, 128)
        elif self.data_args.is_multimodal:
            # image does not exist in the data, but the model is multimodal: pad with a dummy image tensor
            data_dict['image'] = torch.zeros(3, self.data_args.image_size, self.data_args.image_size)
        return data_dict
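

# The collator right-pads input_ids with pad_token_id and labels with IGNORE_INDEX,
# truncates both to model_max_length, and gathers the per-sample modality tensors
# into batch['images'] as (tensor, modality_name) pairs.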
@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning."""

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        input_ids, labels = tuple([instance[key] for instance in instances]
                                  for key in ("input_ids", "labels"))
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids,
            batch_first=True,
            padding_value=self.tokenizer.pad_token_id)
        labels = torch.nn.utils.rnn.pad_sequence(labels,
                                                 batch_first=True,
                                                 padding_value=IGNORE_INDEX)
        input_ids = input_ids[:, :self.tokenizer.model_max_length]
        labels = labels[:, :self.tokenizer.model_max_length]
        batch = dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
        )

        # work for 'images' argument in `prepare_inputs_labels_for_multimodal` of LlavaMetaForCausalLM in llava_arch.py
        batch['images'] = []
        for instance in instances:
            for modal_token in MODAL_INDEX_MAP.keys():
                modal_token = modal_token.lower()
                # modal tokens look like: <image>, <video>, ...
                modal_name = re.findall(f'[<](.*)[>]', modal_token)
                assert len(modal_name) == 1
                modal_name = modal_name[0]
                if modal_name in instance:
                    batch['images'].append((instance[modal_name], modal_name))

        return batch


def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, data_args) -> Dict:
    """Make dataset and collator for supervised fine-tuning."""
    train_dataset = LazySupervisedDataset(
        tokenizer=tokenizer,
        data_path=data_args.data_path,
        data_path_a=data_args.data_path_a,
        data_args=data_args
    )
    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    return dict(train_dataset=train_dataset,
                eval_dataset=None,
                data_collator=data_collator)
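

# train() wires everything together: parse the three argument dataclasses, load the
# (optionally 4/8-bit quantized) backbone, attach LoRA adapters if requested,
# initialize the vision and/or audio towers and their projectors, build the lazy
# dataset + collator, and hand everything to VideoLLaMA2Trainer.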
def train(attn_implementation="flash_attention_2"):
    global local_rank
    set_seed(42)

    parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    local_rank = training_args.local_rank
    compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))

    bnb_model_from_pretrained_args = {}
    if training_args.bits in [4, 8]:
        from transformers import BitsAndBytesConfig
        bnb_model_from_pretrained_args.update(dict(
            # device_map={"": training_args.device},
            # BUG: recent transformers versions raise:
            # ValueError: You can't pass `load_in_4bit` or `load_in_8bit` as a kwarg when passing `quantization_config` argument at the same time
            # load_in_4bit=training_args.bits == 4,
            # load_in_8bit=training_args.bits == 8,
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=training_args.bits == 4,
                load_in_8bit=training_args.bits == 8,
                llm_int8_skip_modules=["mm_projector"],
                llm_int8_threshold=6.0,
                llm_int8_has_fp16_weight=False,
                bnb_4bit_compute_dtype=compute_dtype,
                bnb_4bit_use_double_quant=training_args.double_quant,
                bnb_4bit_quant_type=training_args.quant_type,  # {'fp4', 'nf4'}
                bnb_4bit_quant_storage=compute_dtype,
            )
        ))

    config = VLLMConfigs[model_args.model_type].from_pretrained(model_args.model_path, trust_remote_code=True)
    if 'gemma2' in model_args.model_type:
        config._attn_implementation = 'eager'
    else:
        config._attn_implementation = attn_implementation

    if model_args.vision_tower is not None or model_args.audio_tower is not None:
        model = VLLMs[model_args.model_type].from_pretrained(
            model_args.model_path,
            config=config,
            cache_dir=training_args.cache_dir,
            torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
            do_sample=True,
            **bnb_model_from_pretrained_args
        )
        if 'mixtral' in model_args.model_type:
            import deepspeed
            deepspeed.utils.set_z3_leaf_modules(model, [MixtralSparseMoeBlock])
    else:
        model = transformers.LlamaForCausalLM.from_pretrained(
            model_args.model_path,
            config=config,
            cache_dir=training_args.cache_dir,
            torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
            do_sample=True,
            **bnb_model_from_pretrained_args
        )
    model.config.use_cache = False

    if training_args.bits in [4, 8]:
        from peft import prepare_model_for_kbit_training
        model.config.torch_dtype = (torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
        model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing)

    if training_args.gradient_checkpointing:
        if hasattr(model, "enable_input_require_grads"):
            model.enable_input_require_grads()
        else:
            def make_inputs_require_grad(module, input, output):
                output.requires_grad_(True)
            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

    if training_args.lora_enable:
        from peft import LoraConfig, get_peft_model
        lora_config = LoraConfig(
            r=training_args.lora_r,
            lora_alpha=training_args.lora_alpha,
            target_modules=find_all_linear_names(model),
            lora_dropout=training_args.lora_dropout,
            bias=training_args.lora_bias,
            task_type="CAUSAL_LM",
        )
        if training_args.bits == 16:
            if training_args.bf16:
                model.to(torch.bfloat16)
            if training_args.fp16:
                model.to(torch.float16)
        rank0_print("Adding LoRA adapters...")
        model = get_peft_model(model, lora_config)

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_args.model_path,
        cache_dir=training_args.cache_dir,
        model_max_length=training_args.model_max_length,
        padding_side="right",
        use_fast=True,
    )

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.unk_token

    if model_args.vision_tower is not None:
        # initialize vision encoder + multi-modal projector
        model.get_model().initialize_vision_modules(model_args=model_args, fsdp=training_args.fsdp)

        vision_tower = model.get_vision_tower()
        vision_tower.to(dtype=torch.bfloat16 if training_args.bf16 else torch.float16, device=training_args.device)

        data_args.image_size = vision_tower.image_size
        data_args.image_processor = vision_tower.image_processor
        data_args.video_processor = vision_tower.video_processor if hasattr(vision_tower, "video_processor") else vision_tower.image_processor
        data_args.is_multimodal = True

        model.config.image_aspect_ratio = data_args.image_aspect_ratio
        model.config.tokenizer_padding_side = tokenizer.padding_side
        model.config.tokenizer_model_max_length = tokenizer.model_max_length

        model.config.tune_mm_mlp_adapter = training_args.tune_mm_mlp_adapter = model_args.tune_mm_mlp_adapter
        if model_args.tune_mm_mlp_adapter:
            model.requires_grad_(False)
            for p in model.get_model().mm_projector.parameters():
                p.requires_grad = True

        if model_args.tune_mm_mlp_adapter:
            data_args.is_pretraining = True
        else:
            data_args.is_pretraining = False

        model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter
        if training_args.freeze_mm_mlp_adapter:
            for p in model.get_model().mm_projector.parameters():
                p.requires_grad = False

        if training_args.bits in [4, 8]:
            model.get_model().mm_projector.to(dtype=compute_dtype, device=training_args.device)

        model.config.mm_projector_lr = training_args.mm_projector_lr
        model.config.num_frames = NUM_FRAMES if data_args.num_frames is None else data_args.num_frames

    if model_args.audio_tower is not None:
        # initialize audio encoder + multi-modal projector
        model.get_model().initialize_audio_modules(
            model_args=model_args,
            fsdp=training_args.fsdp
        )

        audio_tower = model.get_audio_tower()
        audio_tower.to(dtype=torch.bfloat16 if training_args.bf16 else torch.float16, device=training_args.device)

        data_args.is_multimodal = True

        model.config.tokenizer_padding_side = tokenizer.padding_side
        model.config.tokenizer_model_max_length = tokenizer.model_max_length

        model.config.tune_mm_mlp_adapter_a = training_args.tune_mm_mlp_adapter_a = model_args.tune_mm_mlp_adapter_a
        training_args.pretrain_mm_mlp_adapter_a = model_args.pretrain_mm_mlp_adapter_a
        training_args.tune_audio_tower = model_args.tune_audio_tower

        # only update the audio projector's parameters while the remaining ones are kept frozen
        if model_args.tune_mm_mlp_adapter_a:
            model.requires_grad_(False)
            for p in model.get_model().mm_projector_a.parameters():
                p.requires_grad = True

        if model_args.tune_audio_tower or model_args.tune_adapter_llm:
            data_args.is_pretraining = False
        else:
            data_args.is_pretraining = True

        model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter
        if training_args.freeze_mm_mlp_adapter:
            for p in model.get_model().mm_projector_a.parameters():
                p.requires_grad = False

        if model_args.tune_adapter_llm:
            model.requires_grad_(True)
            if hasattr(model.get_model(), 'vision_tower'):
                for p in model.get_model().vision_tower.parameters():
                    p.requires_grad = False
            for p in model.get_model().audio_tower.parameters():
                p.requires_grad = False

        if model_args.freeze_backbone:
            model.requires_grad_(False)

        if model_args.tune_audio_tower:
            for p in model.get_model().audio_tower.parameters():
                p.requires_grad = True
        else:
            for p in model.get_model().audio_tower.parameters():
                p.requires_grad = False

        if training_args.bits in [4, 8]:
            model.get_model().mm_projector_a.to(dtype=compute_dtype, device=training_args.device)

        model.config.mm_projector_lr = training_args.mm_projector_lr

    if training_args.bits in [4, 8]:
        from peft.tuners.lora import LoraLayer
        for name, module in model.named_modules():
            if isinstance(module, LoraLayer):
                if training_args.bf16:
                    module = module.to(torch.bfloat16)
            if 'norm' in name:
                module = module.to(torch.float32)
            if 'lm_head' in name or 'embed_tokens' in name:
                if hasattr(module, 'weight'):
                    if training_args.bf16 and module.weight.dtype == torch.float32:
                        module = module.to(torch.bfloat16)

    print("Current model:", model)
    # Uncomment to inspect which parameters are trainable:
    # for name, param in model.named_parameters():
    #     if param.requires_grad:
    #         print(f'Parameter: {name} is trainable')
    #     else:
    #         print(f'Parameter: {name} is frozen')

    data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
    # select a Trainer
    trainer = VideoLLaMA2Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module)

    if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
        trainer.train(resume_from_checkpoint=True)
    else:
        trainer.train()
    trainer.save_state()

    model.config.use_cache = True

    if training_args.lora_enable:
        state_dict = get_peft_state_maybe_zero_3(model.named_parameters(), training_args.lora_bias)
        non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(model.named_parameters())
        if training_args.local_rank == 0 or training_args.local_rank == -1:
            model.config.save_pretrained(training_args.output_dir)
            model.save_pretrained(training_args.output_dir, state_dict=state_dict)
            torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, 'non_lora_trainables.bin'))
    else:
        safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir)


if __name__ == "__main__":
    train()
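
# A minimal sketch of how this script is typically launched (illustrative only;
# the exact entry script, DeepSpeed config, checkpoints, and data paths depend on
# your setup and are placeholders here, not values taken from this file):
#
#   torchrun --nproc_per_node=8 videollama2/train.py \
#       --deepspeed <path-to-zero-config.json> \
#       --model_type videollama2 \
#       --model_path <llm-checkpoint> \
#       --vision_tower <vision-encoder-checkpoint> \
#       --audio_tower <audio-encoder-checkpoint> \
#       --data_path <stage2_audio.json>,<stage2_video.json>,<stage3_av.json> \
#       --data_folder <media-root> \
#       --bf16 True \
#       --output_dir <output-dir>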