Upload genau-full-l.yaml with huggingface_hub
genau-full-l.yaml
ADDED (+361 -0)
@@ -0,0 +1,361 @@
training:
  precision: "high"
  nodes_count: -1

logging:
  project_name: "audioldm-snap"
  wandb_key: 48955513a8a3387ed6a17f75021431035150e1fe
  log_directory: "./log/latent_diffusion"

  # Saving checkpoints
  # If an S3 path is specified, checkpoints are saved at S3_FOLDER/log_directory and deleted from the local folder (except the last checkpoint). Otherwise, checkpoints are saved locally indefinitely.
  S3_BUCKET: "snap-genvid"
  S3_FOLDER: 'mali6/audioldm'
  save_checkpoint_every_n_steps: 1500
  save_top_k: -1

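For reference, the behavior described in the checkpointing comment could look roughly like the following. This is only a hedged sketch using boto3; the helper name and flow are hypothetical, with the bucket and folder taken from this config rather than from the training code:

```python
import os
import boto3

def offload_checkpoint(local_path: str, log_directory: str,
                       bucket: str = "snap-genvid",
                       s3_folder: str = "mali6/audioldm",
                       is_last: bool = False) -> None:
    """Upload a checkpoint to S3_FOLDER/log_directory and free local disk.

    Mirrors the comment above: everything except the most recent
    checkpoint is deleted locally once it is safely on S3.
    """
    key = f"{s3_folder}/{log_directory.lstrip('./')}/{os.path.basename(local_path)}"
    boto3.client("s3").upload_file(local_path, bucket, key)
    if not is_last:  # keep the latest checkpoint on disk for quick resuming
        os.remove(local_path)
```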
variables:
  sampling_rate: &sampling_rate 16000
  mel_bins: &mel_bins 64
  latent_embed_dim: &latent_embed_dim 64
  latent_t_size: &latent_t_size 256 # TODO: might need to change
  latent_f_size: &latent_f_size 1
  in_channels: &unet_in_channels 256
  optimize_ddpm_parameter: &optimize_ddpm_parameter true
  optimize_gpt: &optimize_gpt true
  warmup_steps: &warmup_steps 5000
  lr: &lr 5.0e-3
  mx_steps: &mx_steps 80000000
  batch_size: &bs 20 # TODO: change to 256

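The `&name`/`*name` pairs above are standard YAML anchors and aliases: each value is defined once here and reused verbatim wherever the alias appears later in the file. A minimal, self-contained illustration with PyYAML:

```python
import yaml

doc = """
variables:
  lr: &lr 5.0e-3
model:
  base_learning_rate: *lr  # alias resolves to the anchored value
"""
cfg = yaml.safe_load(doc)
assert cfg["model"]["base_learning_rate"] == 5.0e-3
```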
data:
  metadata_root: "/fsx/mali6/datasets/metadata/dataset_root.json"
  train: ['vggsounds', 'audiocaps', 'caption_audioset', 'wavcaps_audioset_strong', 'wavcaps_bbcsound', 'wavcaps_freesound', 'wavcaps_soundbible', 'clotho', 'fsd50k']
  val: "audiocaps"
  test: "audiocaps"
  class_label_indices: "audioset_eval_subset"
  dataloader_add_ons: []
  augment_p: 0.0
  num_workers: 48
  consistent_start_time: True

  keys_synonyms:
    gt_audio_caption:
      - audiocaps_gt_captions
      - gt_caption
      - gt_captions
      - caption
      - best_model_w_meta_pred_caption
      - gt_audio_caption
      - wavcaps_caption
    tags:
      - keywords
      - tags

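`keys_synonyms` lets one canonical key (e.g. `gt_audio_caption`) be filled from whichever alias a given dataset's metadata actually uses. A hedged sketch of how such a lookup could work; the function name and metadata layout are illustrative, not from the repo:

```python
def resolve_key(sample: dict, canonical: str, synonyms: dict) -> str:
    """Return the value of the first synonym present in a metadata sample."""
    for key in synonyms.get(canonical, [canonical]):
        if key in sample:
            return sample[key]
    raise KeyError(f"none of the synonyms for {canonical!r} found in sample")

sample = {"wavcaps_caption": "rain falling on a tin roof"}
synonyms = {"gt_audio_caption": ["audiocaps_gt_captions", "gt_caption",
                                 "caption", "wavcaps_caption"]}
assert resolve_key(sample, "gt_audio_caption", synonyms) == "rain falling on a tin roof"
```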
step:
  validation_every_n_epochs: 3
  save_checkpoint_every_n_steps: 1500
  # limit_val_batches: 1 # TODO: enable for test
  # limit_train_batches: 128 # TODO: enable for test
  max_steps: *mx_steps
  save_top_k: -1

preprocessing:
  video:
    fps: 1
    height: 224
    width: 224
  audio:
    sampling_rate: *sampling_rate
    max_wav_value: 32768.0
    duration: 10.24
  stft:
    filter_length: 1024
    hop_length: 160
    win_length: 1024
  mel:
    n_mel_channels: *mel_bins
    mel_fmin: 0
    mel_fmax: 8000

augmentation:
  mixup: 0.0

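These values are mutually consistent: 10.24 s at 16 kHz is 163,840 samples, and a 160-sample hop yields 163,840 / 160 = 1,024 STFT frames of 64 mel bins each, in line with a 4x temporal compression down to `latent_t_size: 256`. A hedged sketch with torchaudio; padding conventions can add a frame, and the repo's exact STFT implementation may differ:

```python
import torch
import torchaudio

# 10.24 s at 16 kHz -> 163,840 samples; hop 160 -> about 1,024 frames
mel = torchaudio.transforms.MelSpectrogram(
    sample_rate=16000, n_fft=1024, win_length=1024,
    hop_length=160, f_min=0.0, f_max=8000.0, n_mels=64,
)
wav = torch.randn(1, int(10.24 * 16000))  # dummy audio
fbank = mel(wav)  # (1, 64, 1025) with center padding; presumably cropped/padded to 1,024 frames
print(fbank.shape)
```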
model:
  target: src.models.genau_ddpm.GenAu
  params:
    # dataset token
    dataset_embed_dim: 32
    dataset2id:
      audiocaps: 0
      clotho: 1
      vggsounds: 2
      wavcaps_audioset_strong: 3
      wavcaps_bbcsound: 4
      wavcaps_freesound: 5
      wavcaps_soundbible: 6
      fsd50k: 7
      caption_audioset: 8

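`dataset2id` maps each training corpus to an integer, and `dataset_embed_dim: 32` suggests each id becomes a learned 32-dimensional embedding used as a dataset token. A hedged sketch of that lookup; the layer and variable names are illustrative:

```python
import torch
import torch.nn as nn

num_datasets = 9  # audiocaps .. caption_audioset, per dataset2id above
dataset_embed = nn.Embedding(num_datasets, 32)  # dataset_embed_dim: 32

ids = torch.tensor([0, 7])   # a batch drawn from audiocaps and fsd50k
cond = dataset_embed(ids)    # (2, 32) dataset-token conditioning
print(cond.shape)
```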
    # logging
    validate_uncond: False
    validate_wo_ema: True
    num_val_sampled_timestamps: 10

    # evaluation (disabled)
    # evaluator:
    #   target: audioldm_eval.EvaluationHelper
    #   params:
    #     sampling_rate: 16000
    #     device: 'cuda'

    # Optimizer
    optimizer_config:
      # Which optimizer to use
      target: !module src.modules.optimizers.lamb.Lamb
      # Which LR to use
      lr: *lr
      # The weight decay to use
      weight_decay: 0.01
      # Beta parameters for the optimizer
      betas: [0.9, 0.99]
      # Eps parameter for Adam
      eps: 0.00000001

    base_learning_rate: *lr
    # Final lr for cosine annealing
    final_lr: 0.0015 # Use cosine lr scheduling but do not reach 0, as performance degrades with very small lr
    # Number of warmup steps
    warmup_steps: *warmup_steps
    # Number of steps between each lr update
    lr_update_each_steps: 10
    # Total number of training steps
    max_steps: *mx_steps # TODO: enable
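Read together, these settings describe a linear warmup to `lr` (5.0e-3) over 5,000 steps followed by cosine annealing down to `final_lr` (1.5e-3) at `max_steps`, recomputed every `lr_update_each_steps` steps. A hedged sketch of that schedule; the training code's exact formula may differ:

```python
import math

def lr_at(step: int, lr: float = 5.0e-3, final_lr: float = 1.5e-3,
          warmup_steps: int = 5000, max_steps: int = 80_000_000) -> float:
    """Linear warmup, then cosine annealing that floors at final_lr."""
    if step < warmup_steps:
        return lr * step / warmup_steps
    progress = (step - warmup_steps) / max(1, max_steps - warmup_steps)
    return final_lr + 0.5 * (lr - final_lr) * (1 + math.cos(math.pi * progress))

print(lr_at(2500), lr_at(5000), lr_at(40_000_000))  # mid-warmup, peak, mid-decay
```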

    # Autoencoder
    first_stage_config:
      base_learning_rate: 8.0e-06
      target: src.modules.latent_encoder.autoencoder_1d.AutoencoderKL1D
      params:
        # reload_from_ckpt: "data/checkpoints/vae_mel_16k_64bins.ckpt"
        reload_from_ckpt: "1dvae_64ch_16k_64bins"
        sampling_rate: *sampling_rate
        batchsize: *bs # TODO: change
        monitor: val/rec_loss
        image_key: fbank
        subband: 1
        embed_dim: *latent_embed_dim
        time_shuffle: 1
        lossconfig:
          target: src.losses.LPIPSWithDiscriminator
          params:
            disc_start: 50001
            kl_weight: 1000.0
            disc_weight: 0.5
            disc_in_channels: 1
        ddconfig:
          double_z: true
          mel_bins: *mel_bins # The frequency bins of the mel spectrogram
          z_channels: *unet_in_channels
          resolution: 256
          downsample_time: false
          in_channels: 64
          out_ch: 64 # in and out channels must stay at 64
          ch: 512
          ch_mult:
            - 1
            - 2
            - 4
          num_res_blocks: 3
          attn_resolutions: []
          dropout: 0.0

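`double_z: true` is the usual KL-autoencoder convention: the encoder emits twice `embed_dim` channels, interpreted as the mean and log-variance of a diagonal Gaussian posterior from which the latent is sampled. A hedged sketch of that reparameterization, mirroring the standard latent-diffusion `DiagonalGaussianDistribution` rather than this repo's exact class:

```python
import torch

def sample_latent(encoder_out: torch.Tensor) -> torch.Tensor:
    """encoder_out: (B, 2 * embed_dim, T) -> latent z: (B, embed_dim, T)."""
    mean, logvar = torch.chunk(encoder_out, 2, dim=1)
    logvar = torch.clamp(logvar, -30.0, 20.0)  # numerical safety, as in LDM
    std = torch.exp(0.5 * logvar)
    return mean + std * torch.randn_like(std)  # z = mu + sigma * eps

z = sample_latent(torch.randn(2, 128, 256))    # embed_dim 64 -> 128 encoder channels
print(z.shape)                                 # torch.Size([2, 64, 256])
```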
    # Other parameters
    clip_grad: 0.5
    optimize_ddpm_parameter: *optimize_ddpm_parameter
    sampling_rate: *sampling_rate
    batchsize: *bs
    linear_start: 0.0015 # In DDPM, a linear schedule is used from 1e-4 to 0.02. LDM uses a linear scheduler with the same params. Make-an-Audio uses different start and end values. Improved DDPM introduced a cosine schedule, and RIN a sigmoid one.
    linear_end: 0.0195
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    unconditional_prob_cfg: 0.1
    parameterization: eps # [eps, x0, v]
    first_stage_key: fbank
    latent_t_size: *latent_t_size # TODO: might need to change
    latent_f_size: *latent_f_size
    channels: *latent_embed_dim # TODO: might need to change
    monitor: val/loss_simple_ema

    scale_by_std: True
    # scale_factor: 1.0144787
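For context, in the CompVis latent-diffusion codebase the "linear" schedule actually interpolates in sqrt-space: a linear ramp between the square roots of `linear_start` and `linear_end` is squared to give the betas. A hedged reconstruction with the values above:

```python
import numpy as np

linear_start, linear_end, timesteps = 0.0015, 0.0195, 1000
# LDM's make_beta_schedule("linear"): linspace over sqrt(beta), then square
betas = np.linspace(linear_start ** 0.5, linear_end ** 0.5, timesteps) ** 2
alphas_cumprod = np.cumprod(1.0 - betas)
print(betas[0], betas[-1])  # 0.0015 ... 0.0195
print(alphas_cumprod[-1])   # how much signal survives at t = T
```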

    backbone_type: fit
    unet_config:
      target: src.modules.fit.fit_audio.FIT
      params:
        weight_initializer:
          target: !module src.modules.initializers.initializers.RINWeightScalerInitializer
          scale: 0.57735026919 # 1/sqrt(3), from Yuwei's findings

        fit_block_module: !module src.modules.fit.layers.fit_layers.FITBlockV5
        context_channels: 1024
        summary_text_embeddings_channels: 1536 # text embedding (e.g. CLAP) size

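The `!module` entries are custom YAML tags that resolve a dotted path to a Python object at load time rather than leaving it as a string. One plausible implementation of the mechanism with PyYAML; the repo's actual loader may differ:

```python
import importlib
import yaml

def module_constructor(loader: yaml.Loader, node: yaml.Node):
    """Resolve '!module pkg.mod.Name' to the Name object itself."""
    path = loader.construct_scalar(node)
    module_path, _, attr = path.rpartition(".")
    return getattr(importlib.import_module(module_path), attr)

yaml.SafeLoader.add_constructor("!module", module_constructor)

cfg = yaml.safe_load("opt: !module collections.OrderedDict")
assert cfg["opt"].__name__ == "OrderedDict"
```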
        # If True, inserts the conditioning information in the context
        conditioning_in_context: True

        # The type of positional encodings to use for the time input
        time_pe_type: learned
        # Uses a label that specifies whether the current input is a video or an image
        use_video_image_conditioning: False
        # Uses a label that specifies the framerate of the current video
        use_framerate_conditioning: False
        # Uses a label that specifies the id of the dataset from which the current input comes
        use_dataset_id_conditioning: True
        # Uses a label that specifies the resolution of the current input
        use_resolution_conditioning: False
        # If True, uses the unmasked parts of the denoised input as conditioning
        use_denoised_input_conditioning: False

        # Size of the input in pixels
        input_size: [1, *latent_t_size, *latent_f_size] # (frames_count, height, width)
        # The size in pixels of each patch
        patch_size: [1, 1, 1]
        # The number of patches in each group
        group_size: [1, 32, 1]
        input_channels: *latent_embed_dim
        # The number of channels in the patch embeddings
        patch_channels: 1024
        # The number of FIT blocks
        fit_blocks_count: 6
        # The number of local layers in each FIT block
        local_layers_per_block: 2
        # The number of global layers in each FIT block
        global_layers_per_block: 4
        # The number of latent tokens
        latent_count: 256
        # The number of channels in the latent tokens
        latent_channels: 1536

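These shapes compose as follows: a 1 x 256 x 1 latent with 1 x 1 x 1 patches yields 256 patch tokens, which `group_size: [1, 32, 1]` splits into 8 local groups of 32, while the global pathway runs on 256 learned latent tokens of width 1,536. A small sanity check of that arithmetic:

```python
from math import prod

input_size, patch_size, group_size = (1, 256, 1), (1, 1, 1), (1, 32, 1)
patches = [i // p for i, p in zip(input_size, patch_size)]  # (1, 256, 1)
groups = [n // g for n, g in zip(patches, group_size)]      # (1, 8, 1)

assert prod(patches) == 256  # patch tokens, each with 1024 channels
assert prod(groups) == 8     # local attention groups of 32 patches each
latent_tokens, latent_channels = 256, 1536                  # global pathway
```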
        self_conditioning_ff_config: {}
        fit_block_config:
          attention_class: !module src.modules.fit.layers.rin_layers.Attention
          ff_class: !module src.modules.fit.layers.rin_layers.FeedForward

          # Dropout parameters
          drop_units: 0.1
          drop_path: 0.0

          # Whether to use feedforward layers after cross attention
          use_cross_attention_feedforward: True

          # Configuration for attention layers
          default_attention_config:
            heads: 8
            dim_head: 128
          read_attention_config:
            # Ensure heads * dim_head = min(input_channels, patch_channels)
            heads: 8
            dim_head: 128
          read_context_attention_config:
            # Ensure heads * dim_head = min(latent_channels, context_channels)
            heads: 8
            dim_head: 128
          read_latent_conditioning_attention_config:
            # Ensure heads * dim_head = latent_channels
            heads: 12
            dim_head: 128
          write_attention_config:
            # Ensure heads * dim_head = min(input_channels, patch_channels)
            heads: 8
            dim_head: 128
          local_attention_config:
            # Ensure heads * dim_head = patch_channels
            heads: 8
            dim_head: 128
          global_attention_config:
            # Ensure heads * dim_head = latent_channels
            heads: 12
            dim_head: 128

          ff_config: {}
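The "Ensure heads * dim_head = ..." comments encode a width-matching invariant: each attention block's total width must equal the channel width of the stream it reads or writes. With the numbers above, 8 x 128 = 1024 matches `patch_channels`/`context_channels`, and 12 x 128 = 1536 matches `latent_channels`. A minimal check one could run over such a config:

```python
widths = {"patch_channels": 1024, "context_channels": 1024, "latent_channels": 1536}
attn = {
    "local_attention_config": ({"heads": 8, "dim_head": 128}, "patch_channels"),
    "read_context_attention_config": ({"heads": 8, "dim_head": 128}, "context_channels"),
    "global_attention_config": ({"heads": 12, "dim_head": 128}, "latent_channels"),
}
for name, (cfg, target) in attn.items():
    assert cfg["heads"] * cfg["dim_head"] == widths[target], name
print("attention widths consistent")
```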
        # Legacy UNet backbone (kept for reference):
        # unet_config:
        #   target: audioldm_train.modules.diffusionmodules.openaimodel.UNetModel
        #   params:
        #     image_size: 64
        #     extra_film_condition_dim: 512 # If you use FiLM as an extra condition, set this parameter. For example, if you have two conditioning vectors, each of dimension 512, then this number would be 1024
        #     # context_dim:
        #     # - 768
        #     in_channels: *unet_in_channels # The input channel of the UNet model
        #     out_channels: *latent_embed_dim # TODO: might need to change
        #     model_channels: 128 # TODO: might need to change
        #     attention_resolutions:
        #     - 8
        #     - 4
        #     - 2
        #     num_res_blocks: 2
        #     channel_mult:
        #     - 1
        #     - 2
        #     - 3
        #     - 5
        #     num_head_channels: 32
        #     use_spatial_transformer: true
        #     transformer_depth: 1
        #     extra_sa_layer: false

    cond_stage_config:
      film_clap_cond1:
        cond_stage_key: text
        conditioning_key: film
        target: src.modules.conditional.conditional_models.CLAPAudioEmbeddingClassifierFreev2
        params:
          pretrained_path: clap_htsat_tiny
          sampling_rate: 16000
          embed_mode: text # or audio
          amodel: HTSAT-tiny
      film_flan_t5_cond2:
        cond_stage_key: text
        conditioning_key: film
        target: src.modules.conditional.conditional_models.FlanT5HiddenState
        params:
          text_encoder_name: google/flan-t5-large # google/flan-t5-xxl
          freeze_text_encoder: True
          return_embeds: True
          pool_tokens: True

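Both conditioners use `conditioning_key: film`, i.e. the CLAP and Flan-T5 embeddings modulate the backbone's features through feature-wise affine transforms rather than cross-attention. A hedged sketch of the FiLM mechanism itself, generic rather than this repo's exact module:

```python
import torch
import torch.nn as nn

class FiLM(nn.Module):
    """Predict per-channel scale and shift from a conditioning embedding."""
    def __init__(self, cond_dim: int, feature_dim: int):
        super().__init__()
        self.to_scale_shift = nn.Linear(cond_dim, 2 * feature_dim)

    def forward(self, x: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
        scale, shift = self.to_scale_shift(cond).chunk(2, dim=-1)
        return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)

film = FiLM(cond_dim=512, feature_dim=1024)  # dims are illustrative
x = torch.randn(2, 256, 1024)                # (batch, tokens, channels)
print(film(x, torch.randn(2, 512)).shape)    # unchanged shape, modulated values
```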
      noncond_dataset_ids: # for a non-FIT backbone, use film_dataset_ids and enable encode_dataset_ids
        cond_stage_key: all
        conditioning_key: ignore
        target: src.modules.conditional.conditional_models.DatasetIDs
        params:
          encode_dataset_ids: False
          dataset2id:
            audiocaps: 0
            clotho: 1
            vggsounds: 2
            wavcaps_audioset_strong: 3
            wavcaps_bbcsound: 4
            wavcaps_freesound: 5
            wavcaps_soundbible: 6
            fsd50k: 7
            caption_audioset: 8
            unconditional: 0 # set the unconditional id to 0 for future experiments

    evaluation_params:
      unconditional_guidance_scale: 3.5
      ddim_sampling_steps: 200
      n_candidates_per_samples: 3
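`unconditional_guidance_scale: 3.5` is classifier-free guidance: at each of the 200 DDIM steps the model is evaluated with and without the text condition, and the two predictions are extrapolated. A hedged sketch of that combination step; `model` and its call signature are placeholders for the conditional and unconditional passes:

```python
import torch

def guided_eps(model, x_t: torch.Tensor, t: torch.Tensor, cond,
               scale: float = 3.5) -> torch.Tensor:
    """Classifier-free guidance: push the conditional prediction away
    from the unconditional one by `scale`."""
    eps_cond = model(x_t, t, cond)    # text-conditioned prediction
    eps_uncond = model(x_t, t, None)  # unconditional (condition dropped)
    return eps_uncond + scale * (eps_cond - eps_uncond)
```

With `n_candidates_per_samples: 3`, sampling presumably draws three candidates per caption and keeps the best under some ranking (in AudioLDM-style pipelines, typically CLAP similarity to the text).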