# Model target: dotted import path to the Lightning captioning module, resolved
# by the config loader's custom `!module` tag.
target: !module src.models.pl_htsat_q_bart_captioning.AutoCap

# Shared hyperparameters; the anchors (&name) are presumably aliased (*name)
# elsewhere in the full config — verify against the data/optimizer sections.
variables:
  num_workers: &num_workers 90  # dataloader worker processes
  sampling_rate: &sampling_rate 32000  # audio sample rate in Hz
  warmup_epochs: &warmup_epochs 2  # LR warmup duration, in epochs
  lr: &lr 1.0e-5  # peak learning rate (parses as float in YAML)
  batch_size: &bs 128
training:
  seed: 20  # global RNG seed
  pretrain: true  # load weights from `pretrain_path` before training
  # NOTE(review): placeholder value ("PRETAINED" typo kept byte-identical in
  # case code matches on it) — replace with the path to your checkpoint.
  pretrain_path: "PRETAINED_CHECKPOINT"
  # If true, the most recent checkpoint will be found in the log folder and
  # used to initialize the training.
  resume_training: false
  precision: "high"  # presumably float32 matmul precision — verify against trainer setup
  # If -1, train on the whole world size. For multi-node training, please
  # launch the module with torch.distributed.run.
  nodes_count: -1
  device: "cuda"
  exclude_metrics: ['spice', 'meteor', 'spider']  # metrics skipped during evaluation
logging:
  project_name: "autocap"
  wandb_key: "YOUR_WANDB_KEY"  # obtain yours at https://wandb.ai/authorize
  log_directory: "./run_logs/autocap/train"
  # (optional) If an S3 path is specified, checkpoints will be saved at
  # S3_FOLDER/log_directory and deleted from the local folder (except the last
  # checkpoint). Otherwise, checkpoints will be kept locally indefinitely.
  # S3_BUCKET: "YOUR_S3_BUCKET"
  # S3_FOLDER: 'YOUR_S3_FOLDER'
  save_checkpoint_every_n_epochs: 5
  save_top_k: -1  # -1 keeps every saved checkpoint
step:
  epochs: 20  # total training epochs
  validation_every_n_epochs: 1
  num_sanity_val_steps: 1  # sanity-check validation batches before training starts
  # Debug helpers — uncomment to shorten epochs:
  # limit_train_batches: 20
  # limit_val_batches: 2
model:
  clip_grad: 2  # gradient-clipping threshold
  audio_features_dropout_p: 0.5
  text_features_dropout_p: 0.5
  # If false, the text tokens are directly fed to the decoder (no text Q-Former).
  use_text_qformer: false
  # If false, the audio features are directly fed to the decoder.
  use_audio_qformer: true
  use_clap_embeds: true
  meta_input: true  # feed metadata fields (see meta_keys) to the model
  # If false, the metadata will start with plain-text markers such as
  # "Title:", "Caption:", etc. instead of dedicated special tokens.
  add_special_tokens: true
  resize_token_embeds: true  # presumably resizes embeddings for added special tokens — verify in model code
  meta_keys: ['video_caption', 'title']
  # meta_keys: ['video_caption', 'videollama_caption', 'title', 'description', 'subtitle', 'labels']
  meta:
    max_prompt_len: 128  # max metadata prompt length, in tokens
  clap_embeds:
    model: 'HTSAT-base'
    ckpt: 'pretrained_models/clap/music_speech_audioset_epoch_15_esc_89.98.pt'
    embed_dim: 512
  text_qformer:
    num_text_query_token: 64  # output tokens
    # NOTE(review): "audio2tex" looks like a typo for "audio2text", but the key
    # name is consumed by code — left as-is.
    input_audio2tex_query_embed: true
    detach_video_query_embed: false
    frozen_text_Qformer: false
    hidden_size: 128
    add_cross_attention: true
    num_attention_heads: 8
    num_hidden_layers: 2
  audio_qformer:
    num_audio_query_token: 256
    frozen_audio_Qformer: false
    hidden_size: 256
    add_cross_attention: true
    num_attention_heads: 8
    num_hidden_layers: 2
tokenizer:
max_length: 30
special_tokens: ['