# autocap / genau-full-l.yaml
training:
precision: "high"
nodes_count: -1
logging:
project_name: "audioldm-snap"
wandb_key: 48955513a8a3387ed6a17f75021431035150e1fe
log_directory: "./log/latent_diffusion"
# Saving Checkpoints
# if an S3 path is specified, checkpoints will be saved to S3_FOLDER/log_directory and deleted from the local folder (except the last checkpoint). Otherwise, checkpoints will be saved locally indefinitely
S3_BUCKET: "snap-genvid"
S3_FOLDER: 'mali6/audioldm'
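# With the values above, checkpoints would end up under something like
# s3://snap-genvid/mali6/audioldm/log/latent_diffusion/ (the exact key layout
# depends on how the training code joins S3_FOLDER and log_directory).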
save_checkpoint_every_n_steps: 1500
save_top_k: -1
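# save_top_k: -1 keeps every saved checkpoint rather than only the best k,
# assuming a PyTorch Lightning-style ModelCheckpoint is used under the hood.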
variables:
sampling_rate: &sampling_rate 16000
mel_bins: &mel_bins 64
latent_embed_dim: &latent_embed_dim 64
latent_t_size: &latent_t_size 256 # TODO might need to change
latent_f_size: &latent_f_size 1
in_channels: &unet_in_channels 256
optimize_ddpm_parameter: &optimize_ddpm_parameter true
optimize_gpt: &optimize_gpt true
warmup_steps: &warmup_steps 5000
lr: &lr 5.0e-3
mx_steps: &mx_steps 80000000
batch_size: &bs 20 # TODO: change to 256
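# The &name anchors defined above are reused later in this file via *name aliases
# (e.g. *sampling_rate, *bs, *mx_steps), so a value only needs to be changed here.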
data:
metadata_root: "/fsx/mali6/datasets/metadata/dataset_root.json"
train: ['vggsounds', 'audiocaps', 'caption_audioset', 'wavcaps_audioset_strong', 'wavcaps_bbcsound', 'wavcaps_freesound', 'wavcaps_soundbible', 'clotho', 'fsd50k']
val: "audiocaps"
test: "audiocaps"
class_label_indices: "audioset_eval_subset"
dataloader_add_ons: []
augment_p: 0.0
num_workers: 48
consistent_start_time: True
keys_synonyms:
gt_audio_caption:
- audiocaps_gt_captions
- gt_caption
- gt_captions
- caption
- best_model_w_meta_pred_caption
- gt_audio_caption
- wavcaps_caption
tags:
- keywords
- tags
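# keys_synonyms maps dataset-specific metadata fields onto a canonical key:
# e.g. a sample whose caption is stored under "wavcaps_caption" or "gt_captions"
# is read as "gt_audio_caption" at load time (based on the synonym lists above).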
step:
validation_every_n_epochs: 3
save_checkpoint_every_n_steps: 1500
# limit_val_batches: 1 # TODO: enable for test
# limit_train_batches: 128 # TODO: enable for test
max_steps: *mx_steps
save_top_k: -1
preprocessing:
video:
fps : 1
height: 224
width: 224
audio:
sampling_rate: *sampling_rate
max_wav_value: 32768.0
duration: 10.24
stft:
filter_length: 1024
hop_length: 160
win_length: 1024
mel:
n_mel_channels: *mel_bins
mel_fmin: 0
mel_fmax: 8000
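# Rough shape check with the values above: 10.24 s at 16 kHz is 163840 samples,
# and with hop_length 160 that gives 1024 mel frames of 64 bins. latent_t_size is
# 256, i.e. 1024 / 4, which presumably reflects a 4x temporal compression by the
# first-stage autoencoder (an assumption; see first_stage_config below).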
augmentation:
mixup: 0.0
model:
target: src.models.genau_ddpm.GenAu
params:
# dataset token
dataset_embed_dim: 32
dataset2id:
audiocaps: 0
clotho: 1
vggsounds: 2
wavcaps_audioset_strong: 3
wavcaps_bbcsound: 4
wavcaps_freesound: 5
wavcaps_soundbible: 6
fsd50k: 7
caption_audioset: 8
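# Note: this dataset2id mapping is repeated under cond_stage_config.noncond_dataset_ids
# below; the two should stay in sync.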
# logging
validate_uncond: False
validate_wo_ema: True
num_val_sampled_timestamps: 10
# evaluation # disable evaluation
# evaluator:
# target: audioldm_eval.EvaluationHelper
# params:
# sampling_rate: 16000
# device: 'cuda'
# Optimizer
optimizer_config:
# Which optimizer to use
target: !module src.modules.optimizers.lamb.Lamb
# Which LR to use
lr: *lr
# The weight decay to use
weight_decay: 0.01
# Beta parameters for the optimizer
betas: [0.9,0.99]
# Eps parameter for Adam
eps: 0.00000001
base_learning_rate: *lr
# Final lr for cosine annealing
final_lr: 0.0015 # Use cosine lr scheduling but do not reach 0, as performance degrades with very small lr
# Number of warmup steps
warmup_steps: *warmup_steps
# Number of steps between each lr update
lr_update_each_steps: 10
# Total number of training steps
max_steps: *mx_steps # TODO enable
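# As configured, the learning rate warms up to 5.0e-3 over the first 5000 steps and
# is then cosine-annealed toward final_lr 1.5e-3, with the scheduler stepping every
# 10 steps (a summary of the values above, not of the exact scheduler implementation).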
# Autoencoder
first_stage_config:
base_learning_rate: 8.0e-06
target: src.modules.latent_encoder.autoencoder_1d.AutoencoderKL1D
params:
# reload_from_ckpt: "data/checkpoints/vae_mel_16k_64bins.ckpt"
reload_from_ckpt: "1dvae_64ch_16k_64bins"
sampling_rate: *sampling_rate
batchsize: *bs # TODO: change
monitor: val/rec_loss
image_key: fbank
subband: 1
embed_dim: *latent_embed_dim
time_shuffle: 1
lossconfig:
target: src.losses.LPIPSWithDiscriminator
params:
disc_start: 50001
kl_weight: 1000.0
disc_weight: 0.5
disc_in_channels: 1
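# disc_start: 50001 typically means the adversarial loss only kicks in after ~50k
# steps, as in the taming-transformers LPIPSWithDiscriminator (an assumption about
# this implementation); only reconstruction/KL terms apply before that.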
ddconfig:
double_z: true
mel_bins: *mel_bins # The frequency bins of mel spectrogram
z_channels: *unet_in_channels
resolution: 256
downsample_time: false
in_channels: 64
out_ch: 64 # in and out channels must stay as 64
ch: 512
ch_mult:
- 1
- 2
- 4
num_res_blocks: 3
attn_resolutions: []
dropout: 0.0
# Other parameters
clip_grad: 0.5
optimize_ddpm_parameter: *optimize_ddpm_parameter
sampling_rate: *sampling_rate
batchsize: *bs
linear_start: 0.0015 # In DDPM, a linear schedule is used from 1e-4 to 0.02. LDM uses a linear schedule with the same parameters. Make-an-Audio uses different start and end values. Improved DDPM introduced a cosine schedule and RIN a sigmoid one.
linear_end: 0.0195
num_timesteps_cond: 1
log_every_t: 200
timesteps: 1000
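# Sketch of what these values imply, assuming the usual LDM "linear" schedule
# (betas = linspace(sqrt(linear_start), sqrt(linear_end), timesteps) ** 2):
# beta ramps from 0.0015 to 0.0195 over 1000 steps. The exact spacing depends on
# the make_beta_schedule variant used by this repository.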
unconditional_prob_cfg: 0.1
parameterization: eps # [eps, x0, v]
first_stage_key: fbank
latent_t_size: *latent_t_size # TODO might need to change
latent_f_size: *latent_f_size
channels: *latent_embed_dim # TODO might need to change
monitor: val/loss_simple_ema
scale_by_std: True
# scale_factor: 1.0144787
backbone_type : fit
unet_config:
target: src.modules.fit.fit_audio.FIT
params:
weight_initializer:
target: !module src.modules.initializers.initializers.RINWeightScalerInitializer
scale: 0.57735026919 # 1/sqrt(3) from Yuwei's findings
fit_block_module: !module src.modules.fit.layers.fit_layers.FITBlockV5
context_channels: 1024
summary_text_embeddings_channels: 1536 # text embedding (e.g. CLAP) size
# If True inserts the conditioning information in the context
conditioning_in_context: True
# The type of positional encodings to use for the time input
time_pe_type: learned
# Uses a label that specifies whether the current input is a video or an image
use_video_image_conditioning: False
# Uses a label that specifies the framerate of the current video
use_framerate_conditioning: False
# Uses a label that specifies the id of the dataset from which the current input comes
use_dataset_id_conditioning: True
# Uses a label that specifies the resolution of the current input
use_resolution_conditioning: False
# If True uses the unmasked parts of the denoised input as conditioning
use_denoised_input_conditioning: False
# Size of the input in pixels
input_size: [1, *latent_t_size, *latent_f_size] # (frames_count, height, width)
# The size in pixels of each patch
patch_size: [1, 1, 1]
# The number of patches in each group
group_size: [1, 32, 1]
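# With input_size [1, 256, 1], patch_size [1, 1, 1] and group_size [1, 32, 1],
# the latent is split into 256 single-element patches processed as 8 local groups
# of 32 (simple arithmetic from the values above).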
input_channels: *latent_embed_dim
# The number of channels in the patch embeddings
patch_channels: 1024
# The number of fit blocks
fit_blocks_count: 6
# The number of local layers in each fit block
local_layers_per_block: 2
# The number of global layers in each fit block
global_layers_per_block: 4
# The number of latent tokens
latent_count: 256
# The number of channels in the latent tokens
latent_channels: 1536
self_conditioning_ff_config: {}
fit_block_config:
attention_class: !module src.modules.fit.layers.rin_layers.Attention
ff_class: !module src.modules.fit.layers.rin_layers.FeedForward
# Dropout parameters
drop_units: 0.1
drop_path: 0.0
# Whether to use feedforward layers after cross attention
use_cross_attention_feedforward: True
# Configuration for attention layers
default_attention_config:
heads: 8
dim_head: 128
read_attention_config:
# Ensure heads * dim_head = min(input_channels, patch_channels)
heads: 8
dim_head: 128
read_context_attention_config:
# Ensure heads * dim_head = min(latent_channels, context_channels)
heads: 8
dim_head: 128
read_latent_conditioning_attention_config:
# Ensure heads * dim_head = latent_channels
heads: 12
dim_head: 128
write_attention_config:
# Ensure heads * dim_head = min(input_channels, patch_channels)
heads: 8
dim_head: 128
local_attention_config:
# Ensure heads * dim_head = patch_channels
heads: 8
dim_head: 128
global_attention_config:
# Ensure heads * dim_head = latent_channels
heads: 12
dim_head: 128
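# Sanity check on the attention sizes above: 8 heads * 128 = 1024, matching
# patch_channels/context_channels, and 12 heads * 128 = 1536, matching latent_channels.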
ff_config: {}
# unet_config:
# target: audioldm_train.modules.diffusionmodules.openaimodel.UNetModel
# params:
# image_size: 64
# extra_film_condition_dim: 512 # If you use film as an extra condition, set this parameter. For example, if you have two conditioning vectors, each of dimension 512, this number would be 1024
# # context_dim:
# # - 768
# in_channels: *unet_in_channels # The input channel of the UNet model
# out_channels: *latent_embed_dim # TODO might need to change
# model_channels: 128 # TODO might need to change
# attention_resolutions:
# - 8
# - 4
# - 2
# num_res_blocks: 2
# channel_mult:
# - 1
# - 2
# - 3
# - 5
# num_head_channels: 32
# use_spatial_transformer: true
# transformer_depth: 1
# extra_sa_layer: false
cond_stage_config:
film_clap_cond1:
cond_stage_key: text
conditioning_key: film
target: src.modules.conditional.conditional_models.CLAPAudioEmbeddingClassifierFreev2
params:
pretrained_path: clap_htsat_tiny
sampling_rate: 16000
embed_mode: text # or audio
amodel: HTSAT-tiny
film_flan_t5_cond2:
cond_stage_key: text
conditioning_key: film
target: src.modules.conditional.conditional_models.FlanT5HiddenState
params:
text_encoder_name: google/flan-t5-large # google/flan-t5-xxl
freeze_text_encoder: True
return_embeds: True
pool_tokens: True
noncond_dataset_ids: # for a non-FIT backbone, use film_dataset_ids and enable encode_dataset_ids instead
cond_stage_key: all
conditioning_key: ignore
target: src.modules.conditional.conditional_models.DatasetIDs
params:
encode_dataset_ids: False
dataset2id:
audiocaps: 0
clotho: 1
vggsounds: 2
wavcaps_audioset_strong: 3
wavcaps_bbcsound: 4
wavcaps_freesound: 5
wavcaps_soundbible: 6
fsd50k: 7
caption_audioset: 8
unconditional: 0 # set the unconditional id to 0 for future experiments
evaluation_params:
unconditional_guidance_scale: 3.5
ddim_sampling_steps: 200
n_candidates_per_samples: 3
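# At sampling time these settings correspond to classifier-free guidance with scale 3.5
# over 200 DDIM steps, generating 3 candidates per prompt. In the standard CFG
# formulation, eps = eps_uncond + 3.5 * (eps_cond - eps_uncond); how the candidates are
# ranked (e.g. by CLAP similarity, as in AudioLDM) depends on the evaluation code.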