training:
  precision: "high"
  nodes_count: -1

logging: 
  project_name: "audioldm-snap"
  wandb_key: 48955513a8a3387ed6a17f75021431035150e1fe
  log_directory: "./log/latent_diffusion"

  # Saving checkpoints
  # If an S3 path is specified, checkpoints will be saved at S3_FOLDER/log_directory and deleted from
  # the local folder (except the last checkpoint). Otherwise, checkpoints will be saved locally indefinitely.
  S3_BUCKET: "snap-genvid"
  S3_FOLDER: 'mali6/audioldm'
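  # With the values above, checkpoints would presumably end up under
  # s3://snap-genvid/mali6/audioldm/log/latent_diffusion/ (the exact path join is up to the training code).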
  save_checkpoint_every_n_steps: 1500
  save_top_k: -1
  

variables:
  sampling_rate: &sampling_rate 16000 
  mel_bins: &mel_bins 64
  latent_embed_dim: &latent_embed_dim 64
  latent_t_size: &latent_t_size 256 # TODO might need to change
  latent_f_size: &latent_f_size 1
  in_channels: &unet_in_channels 256
  optimize_ddpm_parameter: &optimize_ddpm_parameter true
  optimize_gpt: &optimize_gpt true
  warmup_steps: &warmup_steps 5000
  lr: &lr 5.0e-3
  mx_steps: &mx_steps 80000000
  batch_size: &bs 20 # TODO: change to 256
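  # The YAML anchors (&name) defined above are referenced elsewhere in this file via aliases (*name),
  # so each value only needs to be changed in one place.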

data: 
  metadata_root: "/fsx/mali6/datasets/metadata/dataset_root.json"
  train: ['vggsounds', 'audiocaps', 'caption_audioset', 'wavcaps_audioset_strong', 'wavcaps_bbcsound', 'wavcaps_freesound', 'wavcaps_soundbible', 'clotho', 'fsd50k']
  val: "audiocaps"
  test: "audiocaps"
  class_label_indices: "audioset_eval_subset"
  dataloader_add_ons: [] 
  augment_p : 0.0
  num_workers: 48
  consistent_start_time: True 

  keys_synonyms:
    gt_audio_caption:
      - audiocaps_gt_captions
      - gt_caption
      - gt_captions
      - caption
      - best_model_w_meta_pred_caption
      - gt_audio_caption
      - wavcaps_caption
    tags:
      - keywords
      - tags


step:
  validation_every_n_epochs: 3
  save_checkpoint_every_n_steps: 1500
  # limit_val_batches: 1 # TODO: enable for test
  # limit_train_batches: 128 # TODO: enable for test
  max_steps: *mx_steps
  save_top_k: -1

preprocessing:
  video:
      fps : 1
      height: 224
      width: 224
  audio:
    sampling_rate: *sampling_rate
    max_wav_value: 32768.0
    duration: 10.24
  stft:
    filter_length: 1024
    hop_length: 160
    win_length: 1024
  mel:
    n_mel_channels: *mel_bins
    mel_fmin: 0
    mel_fmax: 8000 
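    # With these settings, a 10.24 s clip at 16 kHz yields roughly
    # 10.24 * 16000 / 160 = 1024 STFT/mel frames of 64 (*mel_bins) mel bins each.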

augmentation:
  mixup: 0.0

model:
  target: src.models.genau_ddpm.GenAu
  params: 
    # dataset token
    dataset_embed_dim: 32
    dataset2id:
      audiocaps: 0
      clotho: 1
      vggsounds: 2
      wavcaps_audioset_strong: 3
      wavcaps_bbcsound: 4
      wavcaps_freesound: 5
      wavcaps_soundbible: 6
      fsd50k: 7
      caption_audioset: 8


    # logging 
    validate_uncond: False
    validate_wo_ema: True
    num_val_sampled_timestamps: 10

    # evaluation # disable evaluation
    # evaluator:
    #   target: audioldm_eval.EvaluationHelper
    #   params: 
    #     sampling_rate: 16000
    #     device: 'cuda'

    # Optimizer
    optimizer_config:
      # Which optimizer to use
      target: !module src.modules.optimizers.lamb.Lamb
      # Which LR to use 
      lr: *lr
      # The weight decay to use
      weight_decay: 0.01
      # Beta parameters for the optimizer
      betas: [0.9,0.99]
      # Eps parameter for Adam
      eps: 0.00000001      
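      # Roughly, this block resolves to something like
      # (sketch, assuming a LAMB implementation with the usual PyTorch-style signature):
      #   Lamb(model.parameters(), lr=5.0e-3, weight_decay=0.01, betas=(0.9, 0.99), eps=1e-8)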

    base_learning_rate: *lr
    # Final lr for cosine annealing
    final_lr: 0.0015  # Use cosine lr scheduling but do not decay to 0, as performance degrades with a very small lr
    # Number of warmup steps
    warmup_steps: *warmup_steps
    # Number of steps between each lr update
    lr_update_each_steps: 10
    # Total number of training steps
    max_steps: *mx_steps # TODO enable
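    # Sketch of the warmup + cosine-with-floor schedule implied by the settings above
    # (illustrative only; the exact scheduler lives in the training code):
    #   if step < warmup_steps:
    #       lr = base_learning_rate * step / warmup_steps
    #   else:
    #       progress = (step - warmup_steps) / (max_steps - warmup_steps)
    #       lr = final_lr + 0.5 * (base_learning_rate - final_lr) * (1 + cos(pi * progress))
    # The lr is recomputed once every lr_update_each_steps steps.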

    # Autoencoder
    first_stage_config:
      base_learning_rate: 8.0e-06
      target: src.modules.latent_encoder.autoencoder_1d.AutoencoderKL1D
      params: 
        # reload_from_ckpt: "data/checkpoints/vae_mel_16k_64bins.ckpt"
        reload_from_ckpt: "1dvae_64ch_16k_64bins"
        sampling_rate: *sampling_rate
        batchsize: *bs # TODO: change
        monitor: val/rec_loss
        image_key: fbank
        subband: 1
        embed_dim: *latent_embed_dim
        time_shuffle: 1
        lossconfig:
          target: src.losses.LPIPSWithDiscriminator
          params:
            disc_start: 50001
            kl_weight: 1000.0
            disc_weight: 0.5
            disc_in_channels: 1
        ddconfig: 
          double_z: true
          mel_bins: *mel_bins # The frequency bins of mel spectrogram
          z_channels: *unet_in_channels
          resolution: 256
          downsample_time: false
          in_channels: 64
          out_ch: 64 # in and out channels must stay as 64
          ch: 512 
          ch_mult:
          - 1
          - 2
          - 4
          num_res_blocks: 3
          attn_resolutions: []
          dropout: 0.0
      
    # Other parameters
    clip_grad: 0.5
    optimize_ddpm_parameter: *optimize_ddpm_parameter
    sampling_rate: *sampling_rate
    batchsize: *bs
    linear_start: 0.0015 # In DDPM, a linear schedule is used from 1e-4 to 0.02. LDM uses a linear schedule with the same params. Make-An-Audio uses different start and end values. Improved DDPM introduced a cosine schedule and RIN introduced a sigmoid one.
    linear_end: 0.0195
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
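    # The LDM-style "linear" schedule is typically built from the values above as
    # (sketch matching the stock LDM make_beta_schedule; the actual code may differ):
    #   betas = numpy.linspace(linear_start ** 0.5, linear_end ** 0.5, timesteps) ** 2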
    unconditional_prob_cfg: 0.1
    parameterization: eps # [eps, x0, v]
    first_stage_key: fbank
    latent_t_size: *latent_t_size # TODO might need to change
    latent_f_size: *latent_f_size
    channels: *latent_embed_dim # TODO might need to change
    monitor: val/loss_simple_ema
    
    scale_by_std: True
    # scale_factor: 1.0144787

    
    backbone_type : fit
    unet_config:
      target: src.modules.fit.fit_audio.FIT

      params:
        weight_initializer:
          target: !module src.modules.initializers.initializers.RINWeightScalerInitializer
          scale: 0.57735026919 # 1/sqrt(3) from Yuwei's findings

        fit_block_module: !module src.modules.fit.layers.fit_layers.FITBlockV5
        context_channels: 1024
        summary_text_embeddings_channels: 1536 # text embedding (e.g CLAP) size

        # If True inserts the conditioning information in the context
        conditioning_in_context: True

        # The type of positional encodings to use for the time input
        time_pe_type: learned
        # Uses a label that specifies whether the current input is a video or an image
        use_video_image_conditioning: False
        # Uses a label that specifies the framerate of the current video
        use_framerate_conditioning: False
        # Uses a label that specifies the id of the dataset from which the current input comes
        use_dataset_id_conditioning: True
        # Uses a label that specifies the resolution of the current input
        use_resolution_conditioning: False
        # If True uses the unmasked parts of the denoised input as conditioning
        use_denoised_input_conditioning: False

        # Size of the input in pixels
        input_size: [1, *latent_t_size, *latent_f_size]  # (frames_count, height, width)
        # The size in pixels of each patch
        patch_size: [1, 1, 1]
        # The number of patches in each group
        group_size: [1, 32, 1]
        input_channels: *latent_embed_dim
        # The number of channels in the patch embeddings
        patch_channels: 1024
        # The number of fit blocks
        fit_blocks_count: 6
        # The number of local layers in each fit block
        local_layers_per_block: 2
        # The number of global layers in each fit block
        global_layers_per_block: 4
        # The number of latent tokens
        latent_count: 256
        # The number of channels in the latent tokens
        latent_channels: 1536
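        # With the sizes above, there are 1 * 256 * 1 = 256 patches in total,
        # arranged into 256 / 32 = 8 groups of 32 patches each
        # (implied token layout; the exact grouping is handled by the FIT implementation).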

        self_conditioning_ff_config: {}
        fit_block_config:
          attention_class: !module src.modules.fit.layers.rin_layers.Attention
          ff_class: !module src.modules.fit.layers.rin_layers.FeedForward
          
          # Dropout parameters
          drop_units: 0.1
          drop_path: 0.0

          # Whether to use feedforward layers after cross attention
          use_cross_attention_feedforward: True
          
          # Configuration for attention layers
          default_attention_config:
            heads: 8
            dim_head: 128
          read_attention_config:
            # Ensure heads * dim_head = min(input_channels, patch_channels)
            heads: 8
            dim_head: 128
          read_context_attention_config:
            # Ensure heads * dim_head = min(latent_channels, context_channels)
            heads: 8
            dim_head: 128
          read_latent_conditioning_attention_config:
            # Ensure heads * dim_head = latent_channels
            heads: 12
            dim_head: 128
          write_attention_config:
            # Ensure heads * dim_head = min(input_channels, patch_channels)
            heads: 8
            dim_head: 128
          local_attention_config:
            # Ensure heads * dim_head = patch_channels
            heads: 8
            dim_head: 128
          global_attention_config:
            # Ensure heads * dim_head = latent_channels
            heads: 12
            dim_head: 128
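          # For reference, the products implied above: 8 * 128 = 1024 (= patch_channels and context_channels)
          # and 12 * 128 = 1536 (= latent_channels).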
          
          ff_config: {}
    # unet_config:
    #   target: audioldm_train.modules.diffusionmodules.openaimodel.UNetModel
    #   params:
    #     image_size: 64 
    #     extra_film_condition_dim: 512 # If you use film as an extra condition, set this parameter. For example, if you have two conditioning vectors, each of dimension 512, then this number would be 1024
    #     # context_dim: 
    #     # - 768
    #     in_channels: *unet_in_channels # The input channel of the UNet model
    #     out_channels: *latent_embed_dim # TODO might need to change
    #     model_channels: 128 # TODO might need to change
    #     attention_resolutions:
    #     - 8
    #     - 4
    #     - 2
    #     num_res_blocks: 2
    #     channel_mult: 
    #     - 1
    #     - 2
    #     - 3
    #     - 5
    #     num_head_channels: 32
    #     use_spatial_transformer: true
    #     transformer_depth: 1
    #     extra_sa_layer: false
    
    cond_stage_config:
      film_clap_cond1:
        cond_stage_key: text
        conditioning_key: film
        target: src.modules.conditional.conditional_models.CLAPAudioEmbeddingClassifierFreev2
        params:
          pretrained_path: clap_htsat_tiny
          sampling_rate: 16000
          embed_mode: text # or audio
          amodel: HTSAT-tiny
      film_flan_t5_cond2:
        cond_stage_key: text
        conditioning_key: film
        target: src.modules.conditional.conditional_models.FlanT5HiddenState
        params:
          text_encoder_name: google/flan-t5-large # google/flan-t5-xxl
          freeze_text_encoder: True
          return_embeds: True
          pool_tokens: True
        
      noncond_dataset_ids: # for non-FIT backbones, use film_dataset_ids and enable encode_dataset_ids
        cond_stage_key: all
        conditioning_key: ignore
        target: src.modules.conditional.conditional_models.DatasetIDs
        params:
          encode_dataset_ids: False
          dataset2id:
            audiocaps: 0
            clotho: 1
            vggsounds: 2
            wavcaps_audioset_strong: 3
            wavcaps_bbcsound: 4
            wavcaps_freesound: 5
            wavcaps_soundbible: 6
            fsd50k: 7
            caption_audioset: 8
            unconditional: 0 # set the unconditional id to 0 for future experiments



    evaluation_params:
      unconditional_guidance_scale: 3.5
      ddim_sampling_steps: 200
      n_candidates_per_samples: 3
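      # Classifier-free guidance as commonly implemented in LDM-style samplers
      # (sketch; unconditional_prob_cfg above controls how often conditioning is dropped during training):
      #   eps = eps_uncond + unconditional_guidance_scale * (eps_cond - eps_uncond)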