log_directory: "./log/latent_diffusion"

project: "audioldm"

precision: "high"

base_root: "/content/qa-mdt"

pretrained:
  clap_music: "./qa-mdt/checkpoints/clap_music"
  flan_t5: "./qa-mdt/checkpoints/flant5"
  hifi-gan: "./qa-mdt/checkpoints/hifi-gan/checkpoints"
  roberta-base: "./qa-mdt/checkpoints/robertabase"
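# Note: the directories above hold the pretrained components the training code loads
# locally: the CLAP music audio-text encoder, the FLAN-T5 text encoder, the HiFi-GAN
# vocoder checkpoints (the VAE checkpoint referenced further down lives in the same
# folder), and a RoBERTa model. These relative paths and the absolute base_root above
# should be adjusted together when the checkpoints are moved.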
mos_path: ""

train_path:
  train_lmdb_path: []

val_path:
  val_lmdb_path: []
  val_key_path: []
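# These fields are left empty in this config: mos_path presumably points to the
# (pseudo-)MOS quality labels used by the quality-aware model, and the *_lmdb_path /
# *_key_path lists to LMDB shards and key files, if an LMDB-backed dataset were used.
# As written, the named datasets under `data:` below are what select the training and
# evaluation audio.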
variables:
  sampling_rate: &sampling_rate 16000
  mel_bins: &mel_bins 64
  latent_embed_dim: &latent_embed_dim 8
  latent_t_size: &latent_t_size 256
  latent_f_size: &latent_f_size 16
  in_channels: &unet_in_channels 8
  optimize_ddpm_parameter: &optimize_ddpm_parameter true
  optimize_gpt: &optimize_gpt true
  warmup_steps: &warmup_steps 2000
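# The &name entries are YAML anchors, re-used below as *name aliases so the audio, VAE
# and diffusion sections stay in sync. The latent shape follows from the audio settings:
# 10.24 s at 16 kHz with a 160-sample hop gives 1024 mel frames x 64 mel bins, and the
# 4x per-axis downsampling implied by the VAE's ch_mult below yields the 256 x 16 x 8
# latent (latent_t_size x latent_f_size x latent_embed_dim).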
data:
  train: ["audiocaps"]
  val: "audiocaps"
  test: "audiocaps"
  class_label_indices: "audioset_eval_subset"
  dataloader_add_ons: ["waveform_rs_48k"]
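# "audiocaps" refers to a dataset entry in the project's dataset metadata. The
# waveform_rs_48k add-on resamples each clip's waveform to 48 kHz inside the dataloader,
# presumably for the CLAP encoder, which expects 48 kHz input.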
step:
  validation_every_n_epochs: 10000
  save_checkpoint_every_n_steps: 1000
  max_steps: 8000000
  save_top_k: 1000
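# With validation scheduled every 10000 epochs, periodic validation effectively never
# triggers in a typical run; progress is instead tracked through the checkpoints written
# every 1000 steps, over at most 8,000,000 training steps.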
preprocessing:
  audio:
    sampling_rate: *sampling_rate
    max_wav_value: 32768.0
    duration: 10.24
  stft:
    filter_length: 1024
    hop_length: 160
    win_length: 1024
  mel:
    n_mel_channels: *mel_bins
    mel_fmin: 0
    mel_fmax: 8000
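  # Consistency check: at 16 kHz a 160-sample hop is 100 frames per second, so the
  # 10.24 s crop yields 1024 STFT/mel frames; the 1024-point window spans 64 ms, and
  # mel_fmax of 8000 Hz is the Nyquist frequency at this sampling rate.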
augmentation:
  mixup: 0.0
model:
  target: audioldm_train.modules.latent_diffusion.ddpm.LatentDiffusion
  params:
    first_stage_config:
      base_learning_rate: 8.0e-06
      target: audioldm_train.modules.latent_encoder.autoencoder.AutoencoderKL
      params:
        reload_from_ckpt: "./qa-mdt/checkpoints/hifi-gan/checkpoints/vae_mel_16k_64bins.ckpt"
        sampling_rate: *sampling_rate
        batchsize: 1
        monitor: val/rec_loss
        image_key: fbank
        subband: 1
        embed_dim: *latent_embed_dim
        time_shuffle: 1
        lossconfig:
          target: audioldm_train.losses.LPIPSWithDiscriminator
          params:
            disc_start: 50001
            kl_weight: 1000.0
            disc_weight: 0.5
            disc_in_channels: 1
        ddconfig:
          double_z: true
          mel_bins: *mel_bins
          z_channels: 8
          resolution: 256
          downsample_time: false
          in_channels: 1
          out_ch: 1
          ch: 128
          ch_mult:
            - 1
            - 2
            - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
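    # The first_stage_config above is the mel-spectrogram VAE: its three ch_mult levels
    # imply two stride-2 stages, i.e. 4x downsampling along both time and frequency, so
    # the 1024 x 64 mel input becomes a 256 x 16 latent with embed_dim (8) channels.
    # double_z doubles the encoder output so the posterior mean and variance are both
    # predicted before the KL-regularised latent is sampled.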
    base_learning_rate: 8.0e-5
    warmup_steps: *warmup_steps
    optimize_ddpm_parameter: *optimize_ddpm_parameter
    sampling_rate: *sampling_rate
    batchsize: 16
    linear_start: 0.0015
    linear_end: 0.0195
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    unconditional_prob_cfg: 0.1
    parameterization: eps
    first_stage_key: fbank
    latent_t_size: *latent_t_size
    latent_f_size: *latent_f_size
    channels: *latent_embed_dim
    monitor: val/loss_simple_ema
    scale_by_std: true
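    # Diffusion setup for the lines above: a linear beta schedule from 0.0015 to 0.0195
    # over 1000 timesteps with eps (noise-prediction) parameterization, latents rescaled
    # by their standard deviation on the first batch (scale_by_std), and the text
    # condition dropped for 10% of training samples (unconditional_prob_cfg), which is
    # what enables classifier-free guidance at sampling time.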
    unet_config:
      target: audioldm_train.modules.diffusionmodules.PixArt.PixArt_MDT_MOS_AS_TOKEN
      params:
        input_size: [256, 16]
        patch_size: [4, 1]
        overlap_size: [0, 0]
        in_channels: 8
        hidden_size: 1152
        depth: 28
        num_heads: 16
        mlp_ratio: 4.0
        class_dropout_prob: 0.1
        pred_sigma: True
        drop_path: 0.0
        window_size: 0
        window_block_indexes: None
        use_rel_pos: False
        cond_dim: 1024
        lewei_scale: 1.0
        overlap: [0, 0]
        use_cfg: true
        mask_ratio: 0.30
        decode_layer: 8
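        # The 256 x 16 x 8 latent is patchified with the [4, 1] patch size into
        # 64 x 16 = 1024 tokens for a 28-layer, 16-head transformer with hidden size
        # 1152 (PixArt-XL scale). pred_sigma adds a predicted variance channel on top of
        # the noise estimate, mask_ratio masks 30% of the tokens during training with
        # 8 decode layers reconstructing them (the MDT-style masked objective), cond_dim
        # 1024 matches the hidden width of the FLAN-T5 encoder configured below, and, as
        # the class name suggests, the MOS quality score is injected as an extra token.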
    cond_stage_config:
      crossattn_flan_t5:
        cond_stage_key: text
        conditioning_key: crossattn
        target: audioldm_train.conditional_models.FlanT5HiddenState
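    # Text conditioning: FlanT5HiddenState exposes the FLAN-T5 encoder's hidden states
    # for the dataset's text captions (cond_stage_key: text), and they are injected into
    # the transformer through cross-attention (conditioning_key: crossattn).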
    evaluation_params:
      unconditional_guidance_scale: 3.5
      ddim_sampling_steps: 200
      n_candidates_per_samples: 3
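    # Sampling defaults: 200 DDIM steps with a classifier-free guidance scale of 3.5,
    # generating 3 candidates per prompt; in the AudioLDM family the best candidate is
    # typically selected by CLAP text-audio similarity (an assumption here, not stated
    # in this config).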