# Adopted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
# Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import os
import copy
import json
import random
import pathlib
import traceback
from dataclasses import dataclass, field
from typing import Dict, Optional, Sequence, List
# torch-related packages
# NOTE: torch must be imported before transformers. Otherwise, `Segmentation fault (core dumped)` will occur.
import torch
from torch.utils.data import Dataset
import transformers
from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
import sys
sys.path.append('./')
from videollama2.model import *
from videollama2.constants import NUM_FRAMES, IGNORE_INDEX, MODAL_INDEX_MAP
from videollama2.mm_utils import tokenizer_multimodal_token, process_video, process_image, process_audio_file
from videollama2.videollama2_trainer import (
    VideoLLaMA2Trainer,
    get_peft_state_maybe_zero_3, get_peft_state_non_lora_maybe_zero_3,
    find_all_linear_names, safe_save_model_for_hf_trainer,
)

# NOTE: fast tokenizer warning issue: https://github.com/huggingface/transformers/issues/5486
os.environ["TOKENIZERS_PARALLELISM"] = "true"
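
# NOTE: expected to be filled in with the process's local rank by the training entrypoint;
# rank0_print below only prints on the main process (local_rank == 0).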
local_rank = None


def rank0_print(*args):
    if local_rank == 0:
        print(*args)


def set_seed(seed=42):
    """
    Set the random seed for reproducible results.
    :param seed: An integer value to be used as the random seed.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # for multi-GPU setups
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


@dataclass
class ModelArguments:
    # LLM Arguments
    model_type: Optional[str] = field(default="videollama2", metadata={"help": "Model type selected in the list: " + ", ".join(VLLMs.keys())})
    model_path: Optional[str] = field(default="lmsys/vicuna-7b-v1.5")
    version: Optional[str] = field(default="v1", metadata={"help": "Version of the conversation template."})
    freeze_backbone: bool = field(default=False, metadata={"help": "Whether to freeze the LLM backbone."})
    tune_adapter_llm: bool = field(default=False)
    # Connector Arguments
    mm_projector_type: Optional[str] = field(default='linear')
    mm_projector_a_type: Optional[str] = field(default='linear')
    tune_mm_mlp_adapter: bool = field(default=False)
    tune_mm_mlp_adapter_a: bool = field(default=False)
    pretrain_mm_mlp_adapter: Optional[str] = field(default=None)
    pretrain_mm_mlp_adapter_a: Optional[str] = field(default=None)
    # Vision tower Arguments
    vision_tower: Optional[str] = field(default=None)
    mm_vision_select_layer: Optional[int] = field(default=-1)
    mm_vision_select_feature: Optional[str] = field(default="patch")
    # Audio tower Arguments
    audio_tower: Optional[str] = field(default=None)
    tune_audio_tower: bool = field(default=False)


@dataclass
class DataArguments:
    # Path Arguments
    data_path: str = field(default=None, metadata={"help": "Path to the training data."})
    data_path_a: Optional[str] = field(default=None, metadata={"help": "Path to the audio data."})
    # image_folder: Optional[str] = field(default=None)
    # video_folder: Optional[str] = field(default=None)
    data_folder: Optional[str] = field(default=None)
    # Loading Arguments
    is_multimodal: bool = False
    va: bool = field(default=False)
    lazy_preprocess: bool = False
    num_frames: Optional[int] = field(default=None)
    # Preprocess Arguments
    image_aspect_ratio: str = 'square'


@dataclass
class TrainingArguments(transformers.TrainingArguments):
    optim: str = field(default="adamw_torch")
    mm_projector_lr: Optional[float] = None
    freeze_mm_mlp_adapter: bool = field(default=False)
    remove_unused_columns: bool = field(default=False)
    cache_dir: Optional[str] = field(default=None)
    # Training Data Arguments
    group_by_modality_length: bool = field(default=False)
    model_max_length: int = field(
        default=512,
        metadata={
            "help":
            "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    # Lora or Quant Arguments
    double_quant: bool = field(
        default=True,
        metadata={"help": "Compress the quantization statistics through double quantization."}
    )
    quant_type: str = field(
        default="nf4",
        metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
    )
    bits: int = field(
        default=16,
        metadata={"help": "How many bits to use."}
    )
    lora_enable: bool = False
    lora_r: int = 64
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_weight_path: str = ""
    lora_bias: str = "none"
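

# NOTE: a minimal usage sketch, assuming the standard Hugging Face pattern (the actual
# entrypoint appears later in this file and may differ):
#
#   parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
#   model_args, data_args, training_args = parser.parse_args_into_dataclasses()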


def preprocess_plain(
    sources: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
    modal_token: str = None,
) -> Dict:
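    """
    Build single-turn ("plain") training examples.

    Each source must be a (human, gpt) pair whose human turn contains
    `modal_token` (e.g. "<video>"). The human turn is reduced to the modal
    token alone, the pair is rendered with the tokenizer's chat template,
    and the text is tokenized with `tokenizer_multimodal_token`; the targets
    start as a copy of the resulting input ids.
    """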
    roles = {"human": "user", "gpt": "assistant"}
    conversations = []
    input_ids = []
    targets = []
    # print(sources)
    for source in sources:
        # 1. apply chat template for input conversation
        assert len(source) == 2
        assert modal_token in source[0]['value']
        message = [
            {'role': 'user', 'content': modal_token},
            {'role': 'assistant', 'content': source[1]['value']}
        ]
        conversation = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=False)
        # print(conversation)  # e.g. begins with "[INST]"
        # 2. tokenize conversations
        input_ids.append(tokenizer_multimodal_token(conversation, tokenizer, modal_token, return_tensors='pt'))
        # 3. make targets
        targets.append(copy.deepcopy(input_ids[-1]))
        # print(targets)
        instruction = tokenizer.apply_chat_template(message[:1], tokenize=False, add_generation_prompt=True)
        # print(instruction)  # e.g. begins with "[INST]"