mali6 committed
Commit 720b7a8 · verified · 1 Parent(s): 5c17464

Upload genau-full-s.yaml with huggingface_hub

Files changed (1): genau-full-s.yaml (+346, -0)
genau-full-s.yaml ADDED
@@ -0,0 +1,346 @@

training:
  precision: "high"
  nodes_count: -1


logging:
  project_name: "audioldm-snap"
  wandb_key: 48955513a8a3387ed6a17f75021431035150e1fe
  log_directory: "./run_logs/genau/train"

  # Saving checkpoints
  # If an S3 path is specified, checkpoints are saved at S3_FOLDER/log_directory and deleted from the local folder (except the last checkpoint). Otherwise, checkpoints are saved locally indefinitely.
  S3_BUCKET: "snap-genvid"
  S3_FOLDER: 'mali6/audioldm'
  save_checkpoint_every_n_steps: 1500
  save_top_k: -1

variables:
  sampling_rate: &sampling_rate 16000
  mel_bins: &mel_bins 64
  latent_embed_dim: &latent_embed_dim 64
  latent_t_size: &latent_t_size 256 # TODO might need to change
  latent_f_size: &latent_f_size 1
  in_channels: &unet_in_channels 256
  optimize_ddpm_parameter: &optimize_ddpm_parameter true
  optimize_gpt: &optimize_gpt true
  warmup_steps: &warmup_steps 5000
  lr: &lr 5.0e-3
  mx_steps: &mx_steps 8000000
  batch_size: &bs 36 # TODO: change to 256


data:
  train: ['vggsounds', 'audiocaps', 'caption_audioset', 'wavcaps_audioset_strong', 'wavcaps_bbcsound', 'wavcaps_freesound', 'wavcaps_soundbible', 'clotho', 'fsd50k']
  val: "autocap"
  test: "autocap"
  class_label_indices: "audioset_eval_subset"
  dataloader_add_ons: []
  augment_p: 0.0
  num_workers: 48
  consistent_start_time: True

  keys_synonyms:
    gt_audio_caption:
      - audiocaps_gt_captions
      - gt_caption
      - gt_captions
      - caption
      - best_model_w_meta_pred_caption
      - gt_audio_caption
      - autocap_caption
      - wavcaps_caption
    tags:
      - keywords
      - tags


step:
  validation_every_n_epochs: 50
  save_checkpoint_every_n_steps: 2500
  # limit_val_batches: 4 # TODO: enable for test
  # limit_train_batches: 1 # TODO: enable for test
  max_steps: *mx_steps
  save_top_k: -1

preprocessing:
  video:
    fps: 1
    height: 224
    width: 224
  audio:
    sampling_rate: *sampling_rate
    max_wav_value: 32768.0
    duration: 10.24
  stft:
    filter_length: 1024
    hop_length: 160
    win_length: 1024
  mel:
    n_mel_channels: *mel_bins
    mel_fmin: 0
    mel_fmax: 8000

augmentation:
  mixup: 0.0

model:
  target: audioldm_train.modules.latent_diffusion.ddpm.LatentDiffusion
  params:
    # dataset token
    dataset_embed_dim: 32

    # logging
    log_uncond: False
    validation_wo_ema: True
    num_val_sampled_timestamps: 10

    # # evaluation
    # evaluator:
    #   target: audioldm_eval.EvaluationHelper
    #   params:
    #     sampling_rate: 16000
    #     device: 'cuda'

    # Optimizer
    optimizer_config:
      # Which optimizer to use
      target: !module audioldm_train.modules.snapvideo.training.optimizers.lamb.Lamb
      # Which LR to use
      lr: *lr
      # The weight decay to use
      weight_decay: 0.01
      # Beta parameters for the optimizer
      betas: [0.9, 0.99]
      # Eps parameter for Adam
      eps: 0.00000001

    base_learning_rate: *lr
    # Final lr for cosine annealing
    final_lr: 0.0015 # Use cosine lr scheduling but do not reach 0, as performance degrades with a very small lr
    # Number of warmup steps
    warmup_steps: *warmup_steps
    # Number of steps between each lr update
    lr_update_each_steps: 10
    # Total number of training steps
    max_steps: *mx_steps # TODO enable

    # Autoencoder
    first_stage_config:
      base_learning_rate: 8.0e-06
      target: audioldm_train.modules.latent_encoder.autoencoder_1d.AutoencoderKL1D
      params:
        # reload_from_ckpt: "data/checkpoints/vae_mel_16k_64bins.ckpt"
        reload_from_ckpt: "/fsx/mali6/repos/AudioLDM2-training/log/vae_checkpoints/vae_64hdcheckpoint-344999.ckpt"
        sampling_rate: *sampling_rate
        batchsize: *bs # TODO: change
        monitor: val/rec_loss
        image_key: fbank
        subband: 1
        embed_dim: *latent_embed_dim
        time_shuffle: 1
        lossconfig:
          target: audioldm_train.losses.LPIPSWithDiscriminator
          params:
            disc_start: 50001
            kl_weight: 1000.0
            disc_weight: 0.5
            disc_in_channels: 1
        ddconfig:
          double_z: true
          mel_bins: *mel_bins # The frequency bins of the mel spectrogram
          z_channels: *unet_in_channels
          resolution: 256
          downsample_time: false
          in_channels: 64
          out_ch: 64 # in and out channels must stay at 64
          ch: 512
          ch_mult:
            - 1
            - 2
            - 4
          num_res_blocks: 3
          attn_resolutions: []
          dropout: 0.0

    # Other parameters
    clip_grad: 0.5
    optimize_ddpm_parameter: *optimize_ddpm_parameter
    sampling_rate: *sampling_rate
    batchsize: *bs
    linear_start: 0.0015 # In DDPM, a linear schedule is used from 1e-4 to 0.02. LDM uses a linear schedule with the same parameters. Make-An-Audio uses different start and end values. Improved DDPM introduced a cosine schedule and RIN a sigmoid one.
    linear_end: 0.0195
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    unconditional_prob_cfg: 0.1
    parameterization: eps # [eps, x0, v]
    first_stage_key: fbank
    latent_t_size: *latent_t_size # TODO might need to change
    latent_f_size: *latent_f_size
    channels: *latent_embed_dim # TODO might need to change
    monitor: val/loss_simple_ema

    scale_by_std: True
    # scale_factor: 1.0144787


    backbone_type: fit
    unet_config:
      target: audioldm_train.modules.snapvideo.models.vision.backbones.fit_audio.FIT

      params:
        weight_initializer:
          target: !module audioldm_train.modules.snapvideo.models.initializers.rin_weight_scaler_initializer.RINWeightScalerInitializer
          scale: 0.57735026919 # 1/sqrt(3) from Yuwei's findings

        fit_block_module: !module audioldm_train.modules.snapvideo.models.vision.layers.fit_block_v5.FITBlockV5
        context_channels: 1024
        summary_text_embeddings_channels: 1536 # text embedding (e.g. CLAP) size

        # If True, inserts the conditioning information in the context
        conditioning_in_context: True

        # The type of positional encodings to use for the time input
        time_pe_type: learned
        # Uses a label that specifies the dataset of the current input
        use_dataset_id_conditioning: True
        # Uses a label that specifies the resolution of the current input
        use_resolution_conditioning: False

        # Size of the input in pixels
        input_size: [1, *latent_t_size, *latent_f_size] # (frames_count, height, width)
        # The size in pixels of each patch
        patch_size: [1, 1, 1]
        # The number of patches in each group
        group_size: [1, 32, 1]
        input_channels: *latent_embed_dim
        # The number of channels in the patch embeddings
        patch_channels: 1024
        # The number of fit blocks
        fit_blocks_count: 4
        # The number of local layers in each fit block
        local_layers_per_block: 2
        # The number of global layers in each fit block
        global_layers_per_block: 4
        # The number of latent tokens
        latent_count: 256
        # The number of channels in the latent tokens
        latent_channels: 1024

        self_conditioning_ff_config: {}
        fit_block_config:
          attention_class: !module audioldm_train.modules.snapvideo.models.vision.layers.rin_layers.Attention
          ff_class: !module audioldm_train.modules.snapvideo.models.vision.layers.rin_layers.FeedForward

          # Dropout parameters
          drop_units: 0.1
          drop_path: 0.0

          # Whether to use feedforward layers after cross attention
          use_cross_attention_feedforward: True

          # Configuration for attention layers
          default_attention_config:
            heads: 8
            dim_head: 128
          read_attention_config:
            # Ensure heads * dim_head = min(input_channels, patch_channels)
            heads: 8
            dim_head: 128
          read_context_attention_config:
            # Ensure heads * dim_head = min(latent_channels, context_channels)
            heads: 8
            dim_head: 128
          read_latent_conditioning_attention_config:
            # Ensure heads * dim_head = latent_channels
            heads: 8
            dim_head: 128
          write_attention_config:
            # Ensure heads * dim_head = min(input_channels, patch_channels)
            heads: 8
            dim_head: 128
          local_attention_config:
            # Ensure heads * dim_head = patch_channels
            heads: 8
            dim_head: 128
          global_attention_config:
            # Ensure heads * dim_head = latent_channels
            heads: 8
            dim_head: 128

          ff_config: {}
    # unet_config:
    #   target: audioldm_train.modules.diffusionmodules.openaimodel.UNetModel
    #   params:
    #     image_size: 64
    #     extra_film_condition_dim: 512 # If you use FiLM as an extra condition, set this parameter. For example, if you have two conditioning vectors each of dimension 512, this number would be 1024
    #     # context_dim:
    #     # - 768
    #     in_channels: *unet_in_channels # The input channel of the UNet model
    #     out_channels: *latent_embed_dim # TODO might need to change
    #     model_channels: 128 # TODO might need to change
    #     attention_resolutions:
    #       - 8
    #       - 4
    #       - 2
    #     num_res_blocks: 2
    #     channel_mult:
    #       - 1
    #       - 2
    #       - 3
    #       - 5
    #     num_head_channels: 32
    #     use_spatial_transformer: true
    #     transformer_depth: 1
    #     extra_sa_layer: false

    cond_stage_config:
      film_clap_cond1:
        cond_stage_key: text
        conditioning_key: film
        target: audioldm_train.conditional_models.CLAPAudioEmbeddingClassifierFreev2
        params:
          pretrained_path: data/checkpoints/clap_htsat_tiny.pt
          sampling_rate: 16000
          embed_mode: text # or audio
          amodel: HTSAT-tiny
      film_flan_t5_cond2:
        cond_stage_key: text
        conditioning_key: film
        target: audioldm_train.conditional_models.FlanT5HiddenState
        params:
          text_encoder_name: google/flan-t5-large # google/flan-t5-xxl
          freeze_text_encoder: True
          return_embeds: True
          pool_tokens: True

      noncond_dataset_ids: # for a non-FIT backbone, use film_dataset_ids and enable encode_dataset_ids
        cond_stage_key: all
        conditioning_key: ignore
        target: src.modules.conditional.conditional_models.DatasetIDs
        params:
          encode_dataset_ids: False
          dataset2id:
            audiocaps: 0
            clotho: 1
            vggsounds: 2
            wavcaps_audioset_strong: 3
            wavcaps_bbcsound: 4
            wavcaps_freesound: 5
            wavcaps_soundbible: 6
            fsd50k: 7
            caption_audioset: 8
            autocap: 9
            unconditional: 0 # set the unconditional to 0 for future experiments


    evaluation_params:
      unconditional_guidance_scale: 3.5
      ddim_sampling_steps: 200
      n_candidates_per_samples: 3
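
For reference, the uploaded file leans on two YAML features: standard anchors/aliases (e.g. &lr / *lr, &bs / *bs) that keep the variables block and the model parameters in sync, and a custom !module tag that points at Python classes by dotted path (e.g. the Lamb optimizer and the FIT backbone). The sketch below shows one way such a file could be parsed with PyYAML; the constructor registered for !module is an illustrative assumption, since the repository's actual config loader is not part of this commit.

# Minimal sketch (assumptions: PyYAML is available; the real GenAU loader may
# resolve `!module` differently, e.g. by importing the referenced class).
import yaml

def module_constructor(loader, node):
    # Keep the dotted path (e.g. "...optimizers.lamb.Lamb") as a plain string here;
    # the actual trainer presumably imports the referenced object before use.
    return loader.construct_scalar(node)

# Register the custom tag so safe_load can parse the file.
yaml.SafeLoader.add_constructor("!module", module_constructor)

with open("genau-full-s.yaml") as f:
    cfg = yaml.safe_load(f)

# Anchors/aliases are resolved at parse time, so aliased fields already hold
# concrete values: *lr -> 0.005, *bs -> 36, *mx_steps -> 8000000, etc.
assert cfg["model"]["params"]["optimizer_config"]["lr"] == cfg["variables"]["lr"]
print(cfg["variables"]["lr"])   # 0.005
print(cfg["data"]["train"])     # list of training datasets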