# Web file-viewer residue, kept as a comment (not part of the configuration):
# uploaded by OpenSound, "Upload 7 files", commit cb4641a (verified), 1.6 kB.
# Name of the model this configuration describes.
model_name: 'EzAudio-L-Energy'
# Settings for the main model.
# NOTE(review): the group headings below are inferred from key names only;
# confirm each meaning against the code that consumes this mapping.
model:
  # Masking options.
  mae: true
  mae_prob: 0.25
  mask_ratio: [0.25, 1.0]
  mask_span: 10

  # Input/output layout.
  img_size: 500
  patch_size: 1
  in_chans: 257
  out_chans: 128
  input_type: '1d'

  # Network size.
  embed_dim: 1024
  depth: 24
  num_heads: 16
  mlp_ratio: 4.0

  # Attention, normalization and activation choices.
  qkv_bias: false
  qk_scale: null
  qk_norm: layernorm
  norm_layer: layernorm
  act_layer: geglu
  context_norm: true
  use_checkpoint: true

  # time_fusion and its ada_lora_* options
  # (names suggest timestep conditioning - TODO confirm).
  time_fusion: 'ada_lora_bias'
  ada_lora_rank: 32
  ada_lora_alpha: 32

  # Class / context conditioning.
  cls_dim: null
  context_dim: 1024
  context_fusion: 'cross'
  context_max_length: null
  context_pe_method: 'none'

  # Positional-encoding options.
  pe_method: 'none'
  rope_mode: 'shared'

  # Convolution and skip-connection switches.
  use_conv: true
  skip: true
  skip_norm: true
# Settings for the `controlnet` section.
# NOTE(review): key meanings are inferred from their names; confirm against
# the code that consumes this mapping.
controlnet:
  cond_in: 1
  cond_blocks: [64, 128]

  # Masking options for the condition signal.
  cond_mask: true
  cond_mask_prob: 0.25
  cond_mask_ratio: [0.25, 0.50]
  cond_mask_span: 10
# Settings for extracting the conditioning signal (condition_type: energy).
# NOTE(review): hop_size and window_size are presumably in audio samples -
# TODO confirm against the conditioner implementation.
conditioner:
  condition_type: energy
  hop_size: 240
  window_size: 1920
  padding: 'reflect'
  min_db: -60
  norm: true
# Autoencoder settings.
# q_first is usually set to false in other studies; it is set to true here.
autoencoder:
  name: stable_vae
  dim: 128
  sr: 24000
  latent_sr: 50
  q_first: true
  scale: 1.0
  shift: 0.0
# Text-encoder settings.
# A fixed length should be set when using concat mode and for distributed
# training (presumably this refers to max_length below - TODO confirm).
text_encoder:
  model: google/flan-t5-large
  max_length: 100
# NOTE(review): confirm whether `cfg` is meant to be a top-level key or a
# child of `text_encoder`; the consumer of this file decides which is correct.
# Presumably a classifier-free-guidance setting - TODO confirm.
cfg: 0.1
# Diffusion noise-schedule settings.
# NOTE(review): the key names match arguments of Hugging Face diffusers
# schedulers (e.g. DDIMScheduler); confirm which scheduler consumes them.
diff:
  num_train_timesteps: 1000
  beta_schedule: 'scaled_linear'
  beta_start: 0.00085
  beta_end: 0.012
  prediction_type: 'v_prediction'
  rescale_betas_zero_snr: true
  timestep_spacing: 'trailing'
  clip_sample: false