---
# Global training-run settings.
training:
  precision: "high"
  # NOTE(review): -1 looks like a sentinel (e.g. "auto-detect") — confirm
  # against the code that reads this value.
  nodes_count: -1

# Experiment logging and checkpoint settings.
logging:
  project_name: "genau"
  # Placeholder value: replace it with a real key before running, and avoid
  # committing the real key to version control.
  wandb_key: "YOUR_WANDB_KEY (check wandb.ai/authorize)"
  log_directory: "./run_logs/genau/train"

  # NOTE(review): the `step` section of this file also defines
  # `save_checkpoint_every_n_steps` (2500) and `save_top_k`; which of the two
  # the trainer honours cannot be determined from this file — confirm.
  # NOTE(review): these two keys are assumed to belong to `logging`; the
  # original indentation was lost — confirm.
  save_checkpoint_every_n_steps: 1500
  save_top_k: -1

# Shared values. Each entry defines a YAML anchor (&name) that other sections
# of this file reuse through aliases (*name), so a value is changed in one
# place only.
variables:
  sampling_rate: &sampling_rate 16000
  mel_bins: &mel_bins 64
  latent_embed_dim: &latent_embed_dim 64
  latent_t_size: &latent_t_size 256
  latent_f_size: &latent_f_size 1
  in_channels: &unet_in_channels 256
  optimize_ddpm_parameter: &optimize_ddpm_parameter true
  optimize_gpt: &optimize_gpt true
  warmup_steps: &warmup_steps 5000
  lr: &lr 5.0e-3
  mx_steps: &mx_steps 8000000
  batch_size: &bs 36

# Dataset selection and data-loading options.
data:
  # Names of the datasets used for training.
  train:
    - vggsounds
    - audiocaps
    - caption_audioset
    - wavcaps_audioset_strong
    - wavcaps_bbcsound
    - wavcaps_freesound
    - wavcaps_soundbible
    - clotho
    - fsd50k
  val: "audioset"
  test: "audioset"
  class_label_indices: "audioset_eval_subset"
  dataloader_add_ons: []
  augment_p: 0.0
  num_workers: 48
  consistent_start_time: true

  # Alternative key names grouped under one canonical key.
  # NOTE(review): presumably the loader treats every listed name as a synonym
  # of the parent key when reading metadata — confirm against the dataset code.
  # NOTE(review): assumed to be nested under `data`; the original indentation
  # was lost — confirm.
  keys_synonyms:
    gt_audio_caption:
      - audiocaps_gt_captions
      - gt_caption
      - gt_captions
      - caption
      - best_model_w_meta_pred_caption
      - gt_audio_caption
      - autocap_caption
      - wavcaps_caption
    tags:
      - keywords
      - tags

# Step/epoch schedule for validation and checkpointing.
step:
  validation_every_n_epochs: 50
  save_checkpoint_every_n_steps: 2500
  # Alias of the `mx_steps` anchor defined under `variables`.
  max_steps: *mx_steps
  save_top_k: -1

# Input preprocessing parameters, grouped by modality / transform stage.
# NOTE(review): `video`, `audio`, `stft` and `mel` are assumed to be siblings
# under `preprocessing`; the original indentation was lost — confirm.
preprocessing:
  video:
    fps: 1
    height: 224
    width: 224
  audio:
    sampling_rate: *sampling_rate
    max_wav_value: 32768.0
    # NOTE(review): presumably seconds — confirm.
    duration: 10.24
  stft:
    filter_length: 1024
    hop_length: 160
    win_length: 1024
  mel:
    n_mel_channels: *mel_bins
    mel_fmin: 0
    mel_fmax: 8000

# Data augmentation settings.
augmentation:
  mixup: 0.0

# Model definition. Sub-sections follow a `target` (dotted import path) plus
# `params` (mapping of arguments) layout.
# NOTE(review): the nesting below was reconstructed from key names and the
# target/params pattern because the original indentation was lost. In
# particular, confirm which keys belong to `optimizer_config` versus `params`.
# The `!module` tag is a custom tag, so this file must be loaded with a loader
# that registers a constructor for it.
model:
  target: src.models.genau_ddpm.GenAu
  params:
    dataset_embed_dim: 32

    # Validation behaviour.
    validate_uncond: false
    validate_wo_ema: true
    num_val_sampled_timestamps: 10

    # Optimizer.
    optimizer_config:
      target: !module src.modules.optimizers.lamb.Lamb
      lr: *lr
      weight_decay: 0.01
      betas: [0.9, 0.99]
      eps: 0.00000001

    # Learning-rate schedule.
    base_learning_rate: *lr
    final_lr: 0.0015
    warmup_steps: *warmup_steps
    lr_update_each_steps: 10
    max_steps: *mx_steps

    # First-stage autoencoder.
    first_stage_config:
      base_learning_rate: 8.0e-06
      target: src.modules.latent_encoder.autoencoder_1d.AutoencoderKL1D

      params:
        reload_from_ckpt: "1dvae_64ch_16k_64bins"
        sampling_rate: *sampling_rate
        batchsize: *bs
        monitor: val/rec_loss
        image_key: fbank
        subband: 1
        embed_dim: *latent_embed_dim
        time_shuffle: 1

        lossconfig:
          target: src.losses.LPIPSWithDiscriminator
          params:
            disc_start: 50001
            kl_weight: 1000.0
            disc_weight: 0.5
            disc_in_channels: 1
        ddconfig:
          double_z: true
          mel_bins: *mel_bins
          z_channels: *unet_in_channels
          resolution: 256
          downsample_time: false
          in_channels: 64
          out_ch: 64
          ch: 512
          ch_mult:
            - 1
            - 2
            - 4
          num_res_blocks: 3
          attn_resolutions: []
          dropout: 0.0

    # Diffusion / training parameters.
    clip_grad: 0.5
    optimize_ddpm_parameter: *optimize_ddpm_parameter
    sampling_rate: *sampling_rate
    batchsize: *bs
    linear_start: 0.0015
    linear_end: 0.0195
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    unconditional_prob_cfg: 0.1
    parameterization: eps
    first_stage_key: fbank
    latent_t_size: *latent_t_size
    latent_f_size: *latent_f_size
    channels: *latent_embed_dim
    monitor: val/loss_simple_ema

    scale_by_std: true

    # Denoising backbone.
    backbone_type: fit
    unet_config:
      target: src.modules.fit.fit_audio.FIT

      params:
        weight_initializer:
          target: !module src.modules.initializers.initializers.RINWeightScalerInitializer
          # 0.57735026919 is numerically 1/sqrt(3).
          scale: 0.57735026919

        fit_block_module: !module src.modules.fit.layers.fit_layers.FITBlockV5
        context_channels: 1024
        summary_text_embeddings_channels: 1536

        conditioning_in_context: true

        time_pe_type: learned

        use_dataset_id_conditioning: true

        use_resolution_conditioning: false

        # Built from the latent-size anchors defined under `variables`.
        input_size: [1, *latent_t_size, *latent_f_size]

        patch_size: [1, 1, 1]

        group_size: [1, 32, 1]
        input_channels: *latent_embed_dim

        patch_channels: 1024

        fit_blocks_count: 4

        local_layers_per_block: 2

        global_layers_per_block: 4

        latent_count: 256

        latent_channels: 1024

        self_conditioning_ff_config: {}
        fit_block_config:
          attention_class: !module src.modules.fit.layers.rin_layers.Attention
          ff_class: !module src.modules.fit.layers.rin_layers.FeedForward

          drop_units: 0.1
          drop_path: 0.0

          use_cross_attention_feedforward: true

          # Every attention config below uses heads * dim_head = 8 * 128 = 1024,
          # the same number as `patch_channels` and `latent_channels` above.
          # NOTE(review): whether that equality is required is not visible
          # here — confirm before changing one without the other.
          default_attention_config:
            heads: 8
            dim_head: 128
          read_attention_config:
            heads: 8
            dim_head: 128
          read_context_attention_config:
            heads: 8
            dim_head: 128
          read_latent_conditioning_attention_config:
            heads: 8
            dim_head: 128
          write_attention_config:
            heads: 8
            dim_head: 128
          local_attention_config:
            heads: 8
            dim_head: 128
          global_attention_config:
            heads: 8
            dim_head: 128

          ff_config: {}

    # Conditioning encoders, one entry per conditioning signal.
    cond_stage_config:
      film_clap_cond1:
        cond_stage_key: text
        conditioning_key: film
        target: src.modules.conditional.conditional_models.CLAPAudioEmbeddingClassifierFreev2
        params:
          pretrained_path: clap_htsat_tiny
          # Literal value; equal to the `sampling_rate` anchor (16000) but not
          # linked to it.
          sampling_rate: 16000
          embed_mode: text
          amodel: HTSAT-tiny
      film_flan_t5_cond2:
        cond_stage_key: text
        conditioning_key: film
        target: src.modules.conditional.conditional_models.FlanT5HiddenState
        params:
          text_encoder_name: google/flan-t5-large
          freeze_text_encoder: true
          return_embeds: true
          pool_tokens: true

      noncond_dataset_ids:
        cond_stage_key: all
        conditioning_key: ignore
        target: src.modules.conditional.conditional_models.DatasetIDs
        params:
          encode_dataset_ids: false
          # NOTE(review): `audiocaps` and `unconditional` both map to id 0 —
          # confirm this sharing is intentional.
          dataset2id:
            audiocaps: 0
            clotho: 1
            vggsounds: 2
            wavcaps_audioset_strong: 3
            wavcaps_bbcsound: 4
            wavcaps_freesound: 5
            wavcaps_soundbible: 6
            fsd50k: 7
            caption_audioset: 8
            autocap: 9
            unconditional: 0

    # Sampling settings used at evaluation time.
    evaluation_params:
      unconditional_guidance_scale: 3.5
      ddim_sampling_steps: 200
      n_candidates_per_samples: 3