target: !module src.models.pl_htsat_q_bart_captioning.AutoCap

variables:
  num_workers: &num_workers 90
  sampling_rate: &sampling_rate 32000
  warmup_epochs: &warmup_epochs 2
  lr: &lr 1.0e-5
  batch_size: &bs 128

training:
  seed: 20
  pretrain: True
  pretrain_path: "PRETRAINED_CHECKPOINT"
  resume_training: False # if true, the most recent checkpoint will be found in the log folder and used to initialize the training
  precision: "high"
  nodes_count: -1 # if -1, train on the whole world size. For multi-node training, please launch the module with torch.distributed.run
  device: "cuda"
  exclude_metrics: ['spice', 'meteor', 'spider']

logging:
  project_name: "autocap"
  wandb_key: YOUR_WANDB_KEY # see wandb.ai/authorize
  log_directory: "./run_logs/autocap/train"

  # (optional) if an S3 path is specified, checkpoints will be saved at S3_FOLDER/log_directory and deleted
  # from the local folder (except the last checkpoint). Otherwise, checkpoints will be saved locally indefinitely
  # S3_BUCKET: "YOUR_S3_BUCKET"
  # S3_FOLDER: 'YOUR_S3_FOLDER'

  save_checkpoint_every_n_epochs: 5
  save_top_k: -1

step:
  epochs: 20
  validation_every_n_epochs: 1
  num_sanity_val_steps: 1

  # debug
  # limit_train_batches: 20
  # limit_val_batches: 2

model:
  clip_grad: 2
  audio_features_dropout_p: 0.5
  text_features_dropout_p: 0.5
  use_text_qformer: false # if false, the text tokens are fed directly to the decoder
  use_audio_qformer: true # if false, the audio features are fed directly to the decoder
  use_clap_embeds: true
  meta_input: true
  add_special_tokens: True # if False, the metadata will start with Title:, Caption:, etc.
  resize_token_embeds: True
  meta_keys: ['video_caption', 'title']
  # meta_keys: ['video_caption', 'videollama_caption', 'title', 'description', 'subtitle', 'labels']

  meta:
    max_prompt_len: 128

  clap_embeds:
    model: 'HTSAT-base'
    ckpt: 'pretrained_models/clap/music_speech_audioset_epoch_15_esc_89.98.pt'
    embed_dim: 512

  text_qformer:
    num_text_query_token: 64 # number of output query tokens
    input_audio2tex_query_embed: true
    detach_video_query_embed: false
    frozen_text_Qformer: false
    hidden_size: 128
    add_cross_attention: true
    num_attention_heads: 8
    num_hidden_layers: 2

  audio_qformer:
    num_audio_query_token: 256
    frozen_audio_Qformer: false
    hidden_size: 256
    add_cross_attention: true
    num_attention_heads: 8
    num_hidden_layers: 2

  tokenizer:
    max_length: 30
    special_tokens: ['', '', '', '', '', '', '', '', '', '', '', '']

  audio_args:
    sr: 32000
    n_fft: 1024
    hop_length: 320
    f_min: 50
    f_max: 14000
    n_mels: 64
    max_length: 10 # set to 10 for the HTSAT encoder, and to 0 or 30 for a CNN encoder
    mono: True

# caption key used by each dataset:
# audiocaps: audiocaps_gt_captions
# audioset: no caption, labels are available
# 'wavcaps_audioset_strong', 'wavcaps_bbcsound', 'wavcaps_freesound', 'wavcaps_soundbible': wavcaps_caption
# clotho: gt_captions
# fsd50k: no caption, labels are available
data_args:
  data:
    metadata_root: "../dataset_preperation/data/metadata/dataset_root.json"
    train: ['32k_captioned_audiocaps', 'caption_audioset', 'wavcaps_audioset_strong', 'wavcaps_bbcsound', 'wavcaps_freesound', 'wavcaps_soundbible', 'clotho', 'fsd50k']
    val: ['autocap']
    test: ['autocap']

    keys_synonyms:
      gt_audio_caption:
        - audiocaps_gt_captions
        - gt_captions
        - gt_caption
        - caption
        - gt_audio_caption
        - autocap_caption
        - wavcaps_caption
      tags:
        - keywords
        - tags
        - labels

  batch_size: *bs
  num_workers: *num_workers
  augmentation_p: 0.1

  preprocessing:
    video:
      fps: 1
      height: 224
      width: 224
    audio:
      sampling_rate: *sampling_rate
      max_wav_value: 32768.0
      duration: 10.0
    stft:
      filter_length: 1024
      hop_length: 320
      win_length: 1024
    mel:
      n_mel_channels: 64
      mel_fmin: 50
      mel_fmax: 14000
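# The stft/mel values above describe a standard log-mel front end. As an illustrative
# sketch only (the repository's actual preprocessing code may differ), the equivalent
# torchaudio transform would look roughly like this:
#
#   import torch
#   import torchaudio
#
#   mel = torchaudio.transforms.MelSpectrogram(
#       sample_rate=32000,   # preprocessing.audio.sampling_rate
#       n_fft=1024,          # stft.filter_length
#       win_length=1024,     # stft.win_length
#       hop_length=320,      # stft.hop_length
#       f_min=50,            # mel.mel_fmin
#       f_max=14000,         # mel.mel_fmax
#       n_mels=64,           # mel.n_mel_channels
#   )
#   wav, sr = torchaudio.load("example.wav")   # hypothetical 10 s, 32 kHz mono clip
#   log_mel = torch.log(mel(wav) + 1e-6)       # shape (1, 64, ~1000) for 10 s of audio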
audio_encoder_args:
  model_arch: "transformer"
  model_name: "htsat"
  pretrained: True
  freeze: True
  spec_augment: True

text_decoder_args:
  model_tag: "audio_qformer"
  name: "facebook/bart-base"
  pretrained: true
  freeze: False
  freeze_embed_layer: False

  bert_args:
    attention_probs_dropout_prob: 0.2
    hidden_act: "gelu"
    hidden_dropout_prob: 0.2
    hidden_size: 768
    initializer_range: 0.02
    intermediate_size: 2048
    layer_norm_eps: !!float 1e-5
    max_position_embeddings: 128
    model_type: "bert"
    num_attention_heads: 4
    num_hidden_layers: 2
    add_type_embeddings: false
    vocab_size: 30522
    add_cross_attention: true
    is_decoder: true
    num_labels: 0
    name: "bert-base-uncased"

optim_args:
  scheduler: cosine
  lr: *lr
  optimizer_name: "adam"
  betas: [0.9, 0.999]
  eps: !!float 1e-8
  momentum: 0.9
  gamma: 0.05
  warmup_epochs: *warmup_epochs
  weight_decay: !!float 1e-6
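# Note on loading this file: the "&name"/"*name" pairs are standard YAML anchors/aliases
# (e.g. batch_size: *bs resolves to 128), and "!!float" forces scalars such as 1e-8 to be
# parsed as floats rather than strings. "!module" at the top, however, is a custom tag, so
# the repository's config loader has to register a constructor for it; plain yaml.safe_load
# would fail on it. A minimal PyYAML sketch of such a constructor (an assumption for
# illustration, not the repo's actual loader):
#
#   import importlib
#   import yaml
#
#   def _module_constructor(loader, node):
#       # "src.models.pl_htsat_q_bart_captioning.AutoCap" -> the AutoCap class object
#       path = loader.construct_scalar(node)
#       module_name, attr_name = path.rsplit(".", 1)
#       return getattr(importlib.import_module(module_name), attr_name)
#
#   yaml.SafeLoader.add_constructor("!module", _module_constructor)
#
#   with open("train.yaml") as f:            # hypothetical path to this config
#       config = yaml.safe_load(f)
#   ModelClass = config["target"]            # the class referenced by "target" above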