# Training configuration for the latent-diffusion audio model with a FIT
# backbone. Shared hyper-parameters are declared once under `variables` as
# anchors and re-used through aliases further down.

training:
  precision: "high"
  nodes_count: -1

logging:
  project_name: "audioldm-snap"
  # NOTE(review): this is an API credential committed in plain text. Rotate it
  # and supply the key from outside version control (for example an
  # environment variable or a secret store) once the consumer supports it.
  # The value is kept unchanged here so existing runs keep working.
  wandb_key: "48955513a8a3387ed6a17f75021431035150e1fe"
  log_directory: "./run_logs/genau/train"

  # Saving checkpoints
  # If an S3 path is specified, checkpoints are saved at
  # S3_FOLDER/log_directory and deleted from the local folder (except the last
  # checkpoint). Otherwise, checkpoints are saved locally indefinitely.
  S3_BUCKET: "snap-genvid"
  S3_FOLDER: 'mali6/audioldm'
  # NOTE(review): `save_checkpoint_every_n_steps` and `save_top_k` are also set
  # under `step` (2500 / -1) -- confirm which section the trainer reads.
  save_checkpoint_every_n_steps: 1500
  save_top_k: -1

# Anchors shared by the sections below.
variables:
  sampling_rate: &sampling_rate 16000
  mel_bins: &mel_bins 64
  latent_embed_dim: &latent_embed_dim 64
  latent_t_size: &latent_t_size 256  # TODO might need to change
  latent_f_size: &latent_f_size 1
  in_channels: &unet_in_channels 256
  optimize_ddpm_parameter: &optimize_ddpm_parameter true
  optimize_gpt: &optimize_gpt true
  warmup_steps: &warmup_steps 5000
  lr: &lr 5.0e-3
  mx_steps: &mx_steps 8000000
  batch_size: &bs 36  # TODO: change to 256

data:
  train:
    - vggsounds
    - audiocaps
    - caption_audioset
    - wavcaps_audioset_strong
    - wavcaps_bbcsound
    - wavcaps_freesound
    - wavcaps_soundbible
    - clotho
    - fsd50k
  val: "autocap"
  test: "autocap"
  class_label_indices: "audioset_eval_subset"
  dataloader_add_ons: []
  augment_p: 0.0
  num_workers: 48
  consistent_start_time: true

  # Alternative field names that map onto each canonical key.
  keys_synonyms:
    gt_audio_caption:
      - audiocaps_gt_captions
      - gt_caption
      - gt_captions
      - caption
      - best_model_w_meta_pred_caption
      - gt_audio_caption
      - autocap_caption
      - wavcaps_caption
    tags:
      - keywords
      - tags

step:
  validation_every_n_epochs: 50
  save_checkpoint_every_n_steps: 2500
  # limit_val_batches: 4  # TODO: enable for test
  # limit_train_batches: 1  # TODO: enable for test
  max_steps: *mx_steps
  save_top_k: -1

preprocessing:
  video:
    fps: 1
    height: 224
    width: 224
  audio:
    sampling_rate: *sampling_rate
    max_wav_value: 32768.0
    duration: 10.24
  stft:
    filter_length: 1024
    hop_length: 160
    win_length: 1024
  mel:
    n_mel_channels: *mel_bins
    mel_fmin: 0
    mel_fmax: 8000

augmentation:
  mixup: 0.0

model:
  target: audioldm_train.modules.latent_diffusion.ddpm.LatentDiffusion
  params:
    # Dataset token
    dataset_embed_dim: 32

    # Logging
    log_uncond: false
    validation_wo_ema: true
    num_val_sampled_timestamps: 10

    # # Evaluation
    # evaluator:
    #   target: audioldm_eval.EvaluationHelper
    #   params:
    #     sampling_rate: 16000
    #     device: 'cuda'

    # Optimizer
    optimizer_config:
      # Which optimizer to use
      target: !module audioldm_train.modules.snapvideo.training.optimizers.lamb.Lamb
      # Which LR to use
      lr: *lr
      # The weight decay to use
      weight_decay: 0.01
      # Beta parameters for the optimizer
      betas: [0.9, 0.99]
      # Eps parameter for the optimizer
      eps: 0.00000001

    base_learning_rate: *lr
    # Final LR for cosine annealing. Cosine LR scheduling is used but does not
    # reach 0, as performance degrades with a very small LR.
    final_lr: 0.0015
    # Number of warmup steps
    warmup_steps: *warmup_steps
    # Number of steps between each LR update
    lr_update_each_steps: 10
    # Total number of training steps
    max_steps: *mx_steps  # TODO enable

    # Autoencoder
    first_stage_config:
      base_learning_rate: 8.0e-06
      target: audioldm_train.modules.latent_encoder.autoencoder_1d.AutoencoderKL1D
      params:
        # reload_from_ckpt: "data/checkpoints/vae_mel_16k_64bins.ckpt"
        reload_from_ckpt: "/fsx/mali6/repos/AudioLDM2-training/log/vae_checkpoints/vae_64hdcheckpoint-344999.ckpt"
        sampling_rate: *sampling_rate
        batchsize: *bs  # TODO: change
        monitor: val/rec_loss
        image_key: fbank
        subband: 1
        embed_dim: *latent_embed_dim
        time_shuffle: 1
        lossconfig:
          target: audioldm_train.losses.LPIPSWithDiscriminator
          params:
            disc_start: 50001
            kl_weight: 1000.0
            disc_weight: 0.5
            disc_in_channels: 1
        ddconfig:
          double_z: true
          mel_bins: *mel_bins  # The frequency bins of the mel spectrogram
          z_channels: *unet_in_channels
          resolution: 256
          downsample_time: false
          in_channels: 64
          out_ch: 64  # in and out channels must stay as 64
          ch: 512
          ch_mult:
            - 1
            - 2
            - 4
          num_res_blocks: 3
          attn_resolutions: []
          dropout: 0.0

    # Other parameters
    clip_grad: 0.5
    optimize_ddpm_parameter: *optimize_ddpm_parameter
    sampling_rate: *sampling_rate
    batchsize: *bs
    # In DDPM, a linear scheduler is used from 1e-4 to 0.2. LDM uses a linear
    # scheduler with the same params. Make-an-audio uses different start and
    # end values. Improved DDPM introduced the cosine one and RIN introduced
    # the sigmoid one.
    linear_start: 0.0015
    linear_end: 0.0195
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    unconditional_prob_cfg: 0.1
    parameterization: eps  # [eps, x0, v]
    first_stage_key: fbank
    latent_t_size: *latent_t_size  # TODO might need to change
    latent_f_size: *latent_f_size
    channels: *latent_embed_dim  # TODO might need to change
    monitor: val/loss_simple_ema
    scale_by_std: true
    # scale_factor: 1.0144787

    backbone_type: fit
    unet_config:
      target: audioldm_train.modules.snapvideo.models.vision.backbones.fit_audio.FIT
      params:
        weight_initializer:
          target: !module audioldm_train.modules.snapvideo.models.initializers.rin_weight_scaler_initializer.RINWeightScalerInitializer
          scale: 0.57735026919  # 1/sqrt(3) from Yuwei's findings

        fit_block_module: !module audioldm_train.modules.snapvideo.models.vision.layers.fit_block_v5.FITBlockV5
        context_channels: 1024
        summary_text_embeddings_channels: 1536  # text embedding (e.g. CLAP) size

        # If true, inserts the conditioning information in the context
        conditioning_in_context: true

        # The type of positional encodings to use for the time input
        time_pe_type: learned
        # Uses a label that specifies the framerate of the current video
        use_dataset_id_conditioning: true
        # Uses a label that specifies the resolution of the current input
        use_resolution_conditioning: false

        # Size of the input in pixels: (frames_count, height, width)
        input_size: [1, *latent_t_size, *latent_f_size]
        # The size in pixels of each patch
        patch_size: [1, 1, 1]
        # The number of patches in each group
        group_size: [1, 32, 1]
        input_channels: *latent_embed_dim
        # The number of channels in the patch embeddings
        patch_channels: 1024
        # The number of fit blocks
        fit_blocks_count: 4
        # The number of local layers in each fit block
        local_layers_per_block: 2
        # The number of global layers in each fit block
        global_layers_per_block: 4
        # The number of latent tokens
        latent_count: 256
        # The number of channels in the latent tokens
        latent_channels: 1024

        self_conditioning_ff_config: {}
        fit_block_config:
          attention_class: !module audioldm_train.modules.snapvideo.models.vision.layers.rin_layers.Attention
          ff_class: !module audioldm_train.modules.snapvideo.models.vision.layers.rin_layers.FeedForward

          # Dropout parameters
          drop_units: 0.1
          drop_path: 0.0

          # Whether to use feedforward layers after cross attention
          use_cross_attention_feedforward: true

          # Configuration for attention layers
          default_attention_config:
            heads: 8
            dim_head: 128

          read_attention_config:
            # Ensure heads * dim_head = min(input_channels, patch_channels)
            heads: 8
            dim_head: 128

          read_context_attention_config:
            # Ensure heads * dim_head = min(latent_channels, context_channels)
            heads: 8
            dim_head: 128

          read_latent_conditioning_attention_config:
            # Ensure heads * dim_head = latent_channels
            heads: 8
            dim_head: 128

          write_attention_config:
            # Ensure heads * dim_head = min(input_channels, patch_channels)
            heads: 8
            dim_head: 128

          local_attention_config:
            # Ensure heads * dim_head = patch_channels
            heads: 8
            dim_head: 128

          global_attention_config:
            # Ensure heads * dim_head = latent_channels
            heads: 8
            dim_head: 128

          ff_config: {}

    # unet_config:
    #   target: audioldm_train.modules.diffusionmodules.openaimodel.UNetModel
    #   params:
    #     image_size: 64
    #     # If you use film as extra condition, set this parameter. For example
    #     # if you have two conditioning vectors each have dimension 512, then
    #     # this number would be 1024
    #     extra_film_condition_dim: 512
    #     # context_dim:
    #     #   - 768
    #     in_channels: *unet_in_channels  # The input channel of the UNet model
    #     out_channels: *latent_embed_dim  # TODO might need to change
    #     model_channels: 128  # TODO might need to change
    #     attention_resolutions:
    #       - 8
    #       - 4
    #       - 2
    #     num_res_blocks: 2
    #     channel_mult:
    #       - 1
    #       - 2
    #       - 3
    #       - 5
    #     num_head_channels: 32
    #     use_spatial_transformer: true
    #     transformer_depth: 1
    #     extra_sa_layer: false

    cond_stage_config:
      film_clap_cond1:
        cond_stage_key: text
        conditioning_key: film
        target: audioldm_train.conditional_models.CLAPAudioEmbeddingClassifierFreev2
        params:
          pretrained_path: data/checkpoints/clap_htsat_tiny.pt
          sampling_rate: 16000
          embed_mode: text  # or text
          amodel: HTSAT-tiny

      film_flan_t5_cond2:
        cond_stage_key: text
        conditioning_key: film
        target: audioldm_train.conditional_models.FlanT5HiddenState
        params:
          text_encoder_name: google/flan-t5-large  # google/flan-t5-xxl
          freeze_text_encoder: true
          return_embeds: true
          pool_tokens: true

      # For non-fit backbones, please use film_dataset_ids and enable
      # encode_dataset_ids.
      noncond_dataset_ids:
        cond_stage_key: all
        conditioning_key: ignore
        target: src.modules.conditional.conditional_models.DatasetIDs
        params:
          encode_dataset_ids: false
          dataset2id:
            audiocaps: 0
            clotho: 1
            vggsounds: 2
            wavcaps_audioset_strong: 3
            wavcaps_bbcsound: 4
            wavcaps_freesound: 5
            wavcaps_soundbible: 6
            fsd50k: 7
            caption_audioset: 8
            autocap: 9
            # Set the unconditional id to 0 for future experiments
            unconditional: 0

    evaluation_params:
      unconditional_guidance_scale: 3.5
      ddim_sampling_steps: 200
      n_candidates_per_samples: 3