# autocap / genau-full-s.yaml
# Uploaded with huggingface_hub by mali6 (revision 720b7a8, verified; 11.3 kB).
# Global trainer settings.
training:
  # NOTE(review): presumably a float32 matmul precision mode - confirm how
  # the training script consumes this value.
  precision: 'high'
  # NOTE(review): -1 looks like an "auto" sentinel for the node count - confirm.
  nodes_count: -1
# Experiment tracking and checkpoint storage.
logging:
  project_name: "audioldm-snap"
  # SECURITY: a real Weights & Biases API key was committed here in plaintext.
  # Treat that key as leaked: revoke it at wandb.ai/authorize and supply a new
  # one at deploy time instead of storing it in version control.
  # NOTE(review): the wandb client can also read the key from the
  # WANDB_API_KEY environment variable - confirm the training script allows
  # that before relying on it.
  wandb_key: "YOUR_WANDB_KEY"
  log_directory: "./run_logs/genau/train"

  # Saving checkpoints
  # If an S3 path is specified, checkpoints are saved at
  # S3_FOLDER/log_directory and deleted from the local folder (except the last
  # checkpoint). Otherwise, checkpoints are saved locally indefinitely.
  S3_BUCKET: "snap-genvid"
  S3_FOLDER: 'mali6/audioldm'
  # NOTE(review): the `step` section also sets save_checkpoint_every_n_steps
  # (2500) and save_top_k - confirm which of the two the trainer reads.
  save_checkpoint_every_n_steps: 1500
  save_top_k: -1
# Shared values. Each anchor (&name) defined here is re-used further down in
# this file through the matching alias (*name).
variables:
  sampling_rate: &sampling_rate 16000
  mel_bins: &mel_bins 64
  latent_embed_dim: &latent_embed_dim 64
  latent_t_size: &latent_t_size 256  # TODO might need to change
  latent_f_size: &latent_f_size 1
  # Aliased as `z_channels` in the first-stage autoencoder's ddconfig.
  in_channels: &unet_in_channels 256
  optimize_ddpm_parameter: &optimize_ddpm_parameter true
  # NOTE(review): no alias of this anchor appears in the visible part of the file.
  optimize_gpt: &optimize_gpt true
  warmup_steps: &warmup_steps 5000
  lr: &lr 5.0e-3
  mx_steps: &mx_steps 8000000
  batch_size: &bs 36  # TODO: change to 256
# Dataset selection and dataloader options.
data:
  # Datasets combined for training.
  train:
    - 'vggsounds'
    - 'audiocaps'
    - 'caption_audioset'
    - 'wavcaps_audioset_strong'
    - 'wavcaps_bbcsound'
    - 'wavcaps_freesound'
    - 'wavcaps_soundbible'
    - 'clotho'
    - 'fsd50k'
  val: 'autocap'
  test: 'autocap'
  class_label_indices: 'audioset_eval_subset'
  dataloader_add_ons: []
  augment_p: 0.0
  num_workers: 48
  consistent_start_time: true

  # NOTE(review): presumably maps each canonical field name to the alternative
  # key names it may appear under in dataset metadata - confirm against the
  # dataloader.
  keys_synonyms:
    gt_audio_caption:
      - audiocaps_gt_captions
      - gt_caption
      - gt_captions
      - caption
      - best_model_w_meta_pred_caption
      - gt_audio_caption
      - autocap_caption
      - wavcaps_caption
    tags:
      - keywords
      - tags
# Training-loop schedule: validation cadence, checkpoint cadence, step budget.
step:
  validation_every_n_epochs: 50
  save_checkpoint_every_n_steps: 2500
  # limit_val_batches: 4  # TODO: enable for test
  # limit_train_batches: 1  # TODO: enable for test
  # Same value as variables.mx_steps (shared through the anchor).
  max_steps: *mx_steps
  save_top_k: -1
# Input preprocessing, grouped by modality / transform stage.
preprocessing:
  video:
    fps: 1
    height: 224
    width: 224
  audio:
    # Shared with the rest of the config through the &sampling_rate anchor.
    sampling_rate: *sampling_rate
    max_wav_value: 32768.0
    duration: 10.24
  stft:
    filter_length: 1024
    hop_length: 160
    win_length: 1024
  mel:
    # Shared with the autoencoder's ddconfig through the &mel_bins anchor.
    n_mel_channels: *mel_bins
    mel_fmin: 0
    mel_fmax: 8000
# Data augmentation; mixup is set to 0.0 here.
augmentation:
  mixup: 0.0
# Latent diffusion model definition. `target` is the dotted path of the class
# and `params` holds its settings; the same target/params layout is repeated
# for every nested component below.
model:
  target: audioldm_train.modules.latent_diffusion.ddpm.LatentDiffusion
  params:
    # Dataset token
    dataset_embed_dim: 32

    # Logging
    log_uncond: false
    validation_wo_ema: true
    num_val_sampled_timestamps: 10

    # # evaluation
    # evaluator:
    #   target: audioldm_eval.EvaluationHelper
    #   params:
    #     sampling_rate: 16000
    #     device: 'cuda'

    # Optimizer
    optimizer_config:
      # Which optimizer to use
      target: !module audioldm_train.modules.snapvideo.training.optimizers.lamb.Lamb
      # Which LR to use
      lr: *lr
      # The weight decay to use
      weight_decay: 0.01
      # Beta parameters for the optimizer
      betas: [0.9, 0.99]
      # Eps parameter for the optimizer
      eps: 0.00000001

    base_learning_rate: *lr
    # Final lr for cosine annealing. Cosine lr scheduling is used but does not
    # reach 0, as performance degrades with a very small lr.
    final_lr: 0.0015
    # Number of warmup steps
    warmup_steps: *warmup_steps
    # Number of steps between each lr update
    lr_update_each_steps: 10
    # Total number of training steps
    max_steps: *mx_steps  # TODO enable

    # Autoencoder
    first_stage_config:
      base_learning_rate: 8.0e-06
      target: audioldm_train.modules.latent_encoder.autoencoder_1d.AutoencoderKL1D
      params:
        # reload_from_ckpt: "data/checkpoints/vae_mel_16k_64bins.ckpt"
        # NOTE(review): absolute, machine-specific path - it will not resolve
        # on another host.
        reload_from_ckpt: "/fsx/mali6/repos/AudioLDM2-training/log/vae_checkpoints/vae_64hdcheckpoint-344999.ckpt"
        sampling_rate: *sampling_rate
        batchsize: *bs  # TODO: change
        monitor: val/rec_loss
        image_key: fbank
        subband: 1
        embed_dim: *latent_embed_dim
        time_shuffle: 1
        lossconfig:
          target: audioldm_train.losses.LPIPSWithDiscriminator
          params:
            disc_start: 50001
            kl_weight: 1000.0
            disc_weight: 0.5
            disc_in_channels: 1
        ddconfig:
          double_z: true
          # The frequency bins of the mel spectrogram
          mel_bins: *mel_bins
          z_channels: *unet_in_channels
          resolution: 256
          downsample_time: false
          # in and out channels must stay as 64
          in_channels: 64
          out_ch: 64
          ch: 512
          ch_mult: [1, 2, 4]
          num_res_blocks: 3
          attn_resolutions: []
          dropout: 0.0

    # Other parameters
    clip_grad: 0.5
    optimize_ddpm_parameter: *optimize_ddpm_parameter
    sampling_rate: *sampling_rate
    batchsize: *bs
    # In DDPM, a linear scheduler is used from 1e-4 to 0.2. LDM uses a linear
    # scheduler with the same params. Make-an-audio uses different start and
    # end values. Improved DDPM introduced the cosine schedule and RIN
    # introduced the sigmoid one.
    linear_start: 0.0015
    linear_end: 0.0195
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    unconditional_prob_cfg: 0.1
    parameterization: eps  # [eps, x0, v]
    first_stage_key: fbank
    latent_t_size: *latent_t_size  # TODO might need to change
    latent_f_size: *latent_f_size
    channels: *latent_embed_dim  # TODO might need to change
    monitor: val/loss_simple_ema
    scale_by_std: true
    # scale_factor: 1.0144787
    backbone_type: fit

    unet_config:
      target: audioldm_train.modules.snapvideo.models.vision.backbones.fit_audio.FIT
      params:
        weight_initializer:
          target: !module audioldm_train.modules.snapvideo.models.initializers.rin_weight_scaler_initializer.RINWeightScalerInitializer
          scale: 0.57735026919  # 1/sqrt(3) from Yuwei's findings

        fit_block_module: !module audioldm_train.modules.snapvideo.models.vision.layers.fit_block_v5.FITBlockV5
        context_channels: 1024
        # Text embedding (e.g. CLAP) size
        summary_text_embeddings_channels: 1536

        # If true, inserts the conditioning information in the context
        conditioning_in_context: true

        # The type of positional encodings to use for the time input
        time_pe_type: learned
        # NOTE(review): presumably conditions the backbone on a label that
        # identifies the source dataset (see dataset2id further down); the
        # comment previously here described a video framerate label - confirm.
        use_dataset_id_conditioning: true
        # Uses a label that specifies the resolution of the current input
        use_resolution_conditioning: false

        # Size of the input, as (frames_count, height, width)
        input_size: [1, *latent_t_size, *latent_f_size]
        # The size of each patch
        patch_size: [1, 1, 1]
        # The number of patches in each group
        group_size: [1, 32, 1]
        input_channels: *latent_embed_dim
        # The number of channels in the patch embeddings
        patch_channels: 1024
        # The number of fit blocks
        fit_blocks_count: 4
        # The number of local layers in each fit block
        local_layers_per_block: 2
        # The number of global layers in each fit block
        global_layers_per_block: 4
        # The number of latent tokens
        latent_count: 256
        # The number of channels in the latent tokens
        latent_channels: 1024

        self_conditioning_ff_config: {}
        fit_block_config:
          attention_class: !module audioldm_train.modules.snapvideo.models.vision.layers.rin_layers.Attention
          ff_class: !module audioldm_train.modules.snapvideo.models.vision.layers.rin_layers.FeedForward

          # Dropout parameters
          drop_units: 0.1
          drop_path: 0.0

          # Whether to use feedforward layers after cross attention
          use_cross_attention_feedforward: true

          # Configuration for attention layers
          default_attention_config:
            heads: 8
            dim_head: 128
          read_attention_config:
            # Ensure heads * dim_head = min(input_channels, patch_channels)
            # NOTE(review): heads * dim_head is 1024 here while input_channels
            # aliases latent_embed_dim (64) - confirm which constraint the
            # code actually enforces.
            heads: 8
            dim_head: 128
          read_context_attention_config:
            # Ensure heads * dim_head = min(latent_channels, context_channels)
            heads: 8
            dim_head: 128
          read_latent_conditioning_attention_config:
            # Ensure heads * dim_head = latent_channels
            heads: 8
            dim_head: 128
          write_attention_config:
            # Ensure heads * dim_head = min(input_channels, patch_channels)
            # NOTE(review): same mismatch with input_channels as in
            # read_attention_config - confirm.
            heads: 8
            dim_head: 128
          local_attention_config:
            # Ensure heads * dim_head = patch_channels
            heads: 8
            dim_head: 128
          global_attention_config:
            # Ensure heads * dim_head = latent_channels
            heads: 8
            dim_head: 128

          ff_config: {}

    # unet_config:
    #   target: audioldm_train.modules.diffusionmodules.openaimodel.UNetModel
    #   params:
    #     image_size: 64
    #     extra_film_condition_dim: 512 # If you use film as extra condition, set this parameter. For example if you have two conditioning vectors each have dimension 512, then this number would be 1024
    #     # context_dim:
    #     # - 768
    #     in_channels: *unet_in_channels # The input channel of the UNet model
    #     out_channels: *latent_embed_dim # TODO might need to change
    #     model_channels: 128 # TODO might need to change
    #     attention_resolutions:
    #     - 8
    #     - 4
    #     - 2
    #     num_res_blocks: 2
    #     channel_mult:
    #     - 1
    #     - 2
    #     - 3
    #     - 5
    #     num_head_channels: 32
    #     use_spatial_transformer: true
    #     transformer_depth: 1
    #     extra_sa_layer: false

    cond_stage_config:
      film_clap_cond1:
        cond_stage_key: text
        conditioning_key: film
        target: audioldm_train.conditional_models.CLAPAudioEmbeddingClassifierFreev2
        params:
          pretrained_path: data/checkpoints/clap_htsat_tiny.pt
          sampling_rate: 16000
          # NOTE(review): the alternative mode is presumably `audio` - confirm.
          embed_mode: text
          amodel: HTSAT-tiny
      film_flan_t5_cond2:
        cond_stage_key: text
        conditioning_key: film
        target: audioldm_train.conditional_models.FlanT5HiddenState
        params:
          text_encoder_name: google/flan-t5-large  # google/flan-t5-xxl
          freeze_text_encoder: true
          return_embeds: true
          pool_tokens: true
      # For a non-fit backbone, please use film_dataset_ids and enable
      # encode_dataset_ids.
      noncond_dataset_ids:
        cond_stage_key: all
        conditioning_key: ignore
        # NOTE(review): this path starts with `src.` while every other target
        # in this file starts with `audioldm_train.` - confirm it is importable.
        target: src.modules.conditional.conditional_models.DatasetIDs
        params:
          encode_dataset_ids: false
          dataset2id:
            audiocaps: 0
            clotho: 1
            vggsounds: 2
            wavcaps_audioset_strong: 3
            wavcaps_bbcsound: 4
            wavcaps_freesound: 5
            wavcaps_soundbible: 6
            fsd50k: 7
            caption_audioset: 8
            autocap: 9
            # Set the unconditional id to 0 for future experiments
            # (this is the same id as audiocaps).
            unconditional: 0

    evaluation_params:
      unconditional_guidance_scale: 3.5
      ddim_sampling_steps: 200
      n_candidates_per_samples: 3