mali6 committed (verified)
Commit b366428 · Parent(s): f74681e

Upload genau-full-l.yaml with huggingface_hub
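For context, a commit like this is typically produced by a call such as the following; a minimal sketch, assuming the file sits in the working directory and that the destination repo id is mali6/audioldm (the repo id is an assumption, not shown on this page):

    from huggingface_hub import HfApi

    api = HfApi()  # picks up the token from `huggingface-cli login` or the HF_TOKEN env var
    api.upload_file(
        path_or_fileobj="genau-full-l.yaml",   # local file to upload
        path_in_repo="genau-full-l.yaml",      # destination path inside the repo
        repo_id="mali6/audioldm",              # assumption: actual repo id not visible here
        commit_message="Upload genau-full-l.yaml with huggingface_hub",
    )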

Files changed (1):
  1. genau-full-l.yaml +361 -0
genau-full-l.yaml ADDED
@@ -0,0 +1,361 @@
+
+ training:
+   precision: "high"
+   nodes_count: -1
+
+ logging:
+   project_name: "audioldm-snap"
+   wandb_key: 48955513a8a3387ed6a17f75021431035150e1fe
+   log_directory: "./log/latent_diffusion"
+
+   # Saving checkpoints
+   # If an S3 path is specified, checkpoints are saved at S3_FOLDER/log_directory and deleted from the local folder (except the last checkpoint). Otherwise, checkpoints are saved locally indefinitely.
+   S3_BUCKET: "snap-genvid"
+   S3_FOLDER: 'mali6/audioldm'
+   save_checkpoint_every_n_steps: 1500
+   save_top_k: -1
+
+
+ variables:
+   sampling_rate: &sampling_rate 16000
+   mel_bins: &mel_bins 64
+   latent_embed_dim: &latent_embed_dim 64
+   latent_t_size: &latent_t_size 256 # TODO: might need to change
+   latent_f_size: &latent_f_size 1
+   in_channels: &unet_in_channels 256
+   optimize_ddpm_parameter: &optimize_ddpm_parameter true
+   optimize_gpt: &optimize_gpt true
+   warmup_steps: &warmup_steps 5000
+   lr: &lr 5.0e-3
+   mx_steps: &mx_steps 80000000
+   batch_size: &bs 20 # TODO: change to 256
+
+ data:
+   metadata_root: "/fsx/mali6/datasets/metadata/dataset_root.json"
+   train: ['vggsounds', 'audiocaps', 'caption_audioset', 'wavcaps_audioset_strong', 'wavcaps_bbcsound', 'wavcaps_freesound', 'wavcaps_soundbible', 'clotho', 'fsd50k']
+   val: "audiocaps"
+   test: "audiocaps"
+   class_label_indices: "audioset_eval_subset"
+   dataloader_add_ons: []
+   augment_p: 0.0
+   num_workers: 48
+   consistent_start_time: True
+
+   keys_synonyms:
+     gt_audio_caption:
+       - audiocaps_gt_captions
+       - gt_caption
+       - gt_captions
+       - caption
+       - best_model_w_meta_pred_caption
+       - gt_audio_caption
+       - wavcaps_caption
+     tags:
+       - keywords
+       - tags
+
+
+ step:
+   validation_every_n_epochs: 3
+   save_checkpoint_every_n_steps: 1500
+   # limit_val_batches: 1 # TODO: enable for test
+   # limit_train_batches: 128 # TODO: enable for test
+   max_steps: *mx_steps
+   save_top_k: -1
+
+ preprocessing:
+   video:
+     fps: 1
+     height: 224
+     width: 224
+   audio:
+     sampling_rate: *sampling_rate
+     max_wav_value: 32768.0
+     duration: 10.24
+   stft:
+     filter_length: 1024
+     hop_length: 160
+     win_length: 1024
+   mel:
+     n_mel_channels: *mel_bins
+     mel_fmin: 0
+     mel_fmax: 8000
+
+ augmentation:
+   mixup: 0.0
+
+ model:
+   target: src.models.genau_ddpm.GenAu
+   params:
+     # dataset token
+     dataset_embed_dim: 32
+     dataset2id:
+       audiocaps: 0
+       clotho: 1
+       vggsounds: 2
+       wavcaps_audioset_strong: 3
+       wavcaps_bbcsound: 4
+       wavcaps_freesound: 5
+       wavcaps_soundbible: 6
+       fsd50k: 7
+       caption_audioset: 8
+
+
+     # logging
+     validate_uncond: False
+     validate_wo_ema: True
+     num_val_sampled_timestamps: 10
+
+     # evaluation (disabled)
+     # evaluator:
+     #   target: audioldm_eval.EvaluationHelper
+     #   params:
+     #     sampling_rate: 16000
+     #     device: 'cuda'
+
+     # Optimizer
+     optimizer_config:
+       # Which optimizer to use
+       target: !module src.modules.optimizers.lamb.Lamb
+       # Which LR to use
+       lr: *lr
+       # The weight decay to use
+       weight_decay: 0.01
+       # Beta parameters for the optimizer
+       betas: [0.9, 0.99]
+       # Eps parameter for Adam
+       eps: 0.00000001
+
+     base_learning_rate: *lr
+     # Final lr for cosine annealing
+     final_lr: 0.0015 # Use cosine lr scheduling but do not reach 0, as performance degrades with a very small lr
+     # Number of warmup steps
+     warmup_steps: *warmup_steps
+     # Number of steps between each lr update
+     lr_update_each_steps: 10
+     # Total number of training steps
+     max_steps: *mx_steps # TODO: enable
+
+     # Autoencoder
+     first_stage_config:
+       base_learning_rate: 8.0e-06
+       target: src.modules.latent_encoder.autoencoder_1d.AutoencoderKL1D
+       params:
+         # reload_from_ckpt: "data/checkpoints/vae_mel_16k_64bins.ckpt"
+         reload_from_ckpt: "1dvae_64ch_16k_64bins"
+         sampling_rate: *sampling_rate
+         batchsize: *bs # TODO: change
+         monitor: val/rec_loss
+         image_key: fbank
+         subband: 1
+         embed_dim: *latent_embed_dim
+         time_shuffle: 1
+         lossconfig:
+           target: src.losses.LPIPSWithDiscriminator
+           params:
+             disc_start: 50001
+             kl_weight: 1000.0
+             disc_weight: 0.5
+             disc_in_channels: 1
+         ddconfig:
+           double_z: true
+           mel_bins: *mel_bins # The number of frequency bins in the mel spectrogram
+           z_channels: *unet_in_channels
+           resolution: 256
+           downsample_time: false
+           in_channels: 64
+           out_ch: 64 # in and out channels must stay at 64
+           ch: 512
+           ch_mult:
+             - 1
+             - 2
+             - 4
+           num_res_blocks: 3
+           attn_resolutions: []
+           dropout: 0.0
+
+     # Other parameters
+     clip_grad: 0.5
+     optimize_ddpm_parameter: *optimize_ddpm_parameter
+     sampling_rate: *sampling_rate
+     batchsize: *bs
+     linear_start: 0.0015 # In DDPM, a linear schedule from 1e-4 to 0.2 is used. LDM uses a linear schedule with the same parameters. Make-An-Audio uses different start and end values. Improved DDPM introduced a cosine schedule, and RIN a sigmoid one.
+     linear_end: 0.0195
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     unconditional_prob_cfg: 0.1
+     parameterization: eps # [eps, x0, v]
+     first_stage_key: fbank
+     latent_t_size: *latent_t_size # TODO: might need to change
+     latent_f_size: *latent_f_size
+     channels: *latent_embed_dim # TODO: might need to change
+     monitor: val/loss_simple_ema
+
+     scale_by_std: True
+     # scale_factor: 1.0144787
+
+
+     backbone_type: fit
+     unet_config:
+       target: src.modules.fit.fit_audio.FIT
+
+       params:
+         weight_initializer:
+           target: !module src.modules.initializers.initializers.RINWeightScalerInitializer
+           scale: 0.57735026919 # 1/sqrt(3), from Yuwei's findings
+
+         fit_block_module: !module src.modules.fit.layers.fit_layers.FITBlockV5
+         context_channels: 1024
+         summary_text_embeddings_channels: 1536 # text embedding (e.g. CLAP) size
+
+         # If True, inserts the conditioning information in the context
+         conditioning_in_context: True
+
+         # The type of positional encodings to use for the time input
+         time_pe_type: learned
+         # Uses a label that specifies whether the current input is a video or an image
+         use_video_image_conditioning: False
+         # Uses a label that specifies the framerate of the current video
+         use_framerate_conditioning: False
+         # Uses a label that specifies the id of the dataset from which the current input comes
+         use_dataset_id_conditioning: True
+         # Uses a label that specifies the resolution of the current input
+         use_resolution_conditioning: False
+         # If True, uses the unmasked parts of the denoised input as conditioning
+         use_denoised_input_conditioning: False
+
+         # Size of the input in pixels
+         input_size: [1, *latent_t_size, *latent_f_size] # (frames_count, height, width)
+         # The size in pixels of each patch
+         patch_size: [1, 1, 1]
+         # The number of patches in each group
+         group_size: [1, 32, 1]
+         input_channels: *latent_embed_dim
+         # The number of channels in the patch embeddings
+         patch_channels: 1024
+         # The number of FIT blocks
+         fit_blocks_count: 6
+         # The number of local layers in each FIT block
+         local_layers_per_block: 2
+         # The number of global layers in each FIT block
+         global_layers_per_block: 4
+         # The number of latent tokens
+         latent_count: 256
+         # The number of channels in the latent tokens
+         latent_channels: 1536
+
+         self_conditioning_ff_config: {}
+         fit_block_config:
+           attention_class: !module src.modules.fit.layers.rin_layers.Attention
+           ff_class: !module src.modules.fit.layers.rin_layers.FeedForward
+
+           # Dropout parameters
+           drop_units: 0.1
+           drop_path: 0.0
+
+           # Whether to use feedforward layers after cross attention
+           use_cross_attention_feedforward: True
+
+           # Configuration for attention layers
+           default_attention_config:
+             heads: 8
+             dim_head: 128
+           read_attention_config:
+             # Ensure heads * dim_head = min(input_channels, patch_channels)
+             heads: 8
+             dim_head: 128
+           read_context_attention_config:
+             # Ensure heads * dim_head = min(latent_channels, context_channels)
+             heads: 8
+             dim_head: 128
+           read_latent_conditioning_attention_config:
+             # Ensure heads * dim_head = latent_channels
+             heads: 12
+             dim_head: 128
+           write_attention_config:
+             # Ensure heads * dim_head = min(input_channels, patch_channels)
+             heads: 8
+             dim_head: 128
+           local_attention_config:
+             # Ensure heads * dim_head = patch_channels
+             heads: 8
+             dim_head: 128
+           global_attention_config:
+             # Ensure heads * dim_head = latent_channels
+             heads: 12
+             dim_head: 128
+
+           ff_config: {}
+     # unet_config:
+     #   target: audioldm_train.modules.diffusionmodules.openaimodel.UNetModel
+     #   params:
+     #     image_size: 64
+     #     extra_film_condition_dim: 512 # If you use FiLM as an extra condition, set this parameter. For example, if you have two conditioning vectors, each of dimension 512, this number would be 1024.
+     #     # context_dim:
+     #     # - 768
+     #     in_channels: *unet_in_channels # The input channels of the UNet model
+     #     out_channels: *latent_embed_dim # TODO: might need to change
+     #     model_channels: 128 # TODO: might need to change
+     #     attention_resolutions:
+     #     - 8
+     #     - 4
+     #     - 2
+     #     num_res_blocks: 2
+     #     channel_mult:
+     #     - 1
+     #     - 2
+     #     - 3
+     #     - 5
+     #     num_head_channels: 32
+     #     use_spatial_transformer: true
+     #     transformer_depth: 1
+     #     extra_sa_layer: false
+
+     cond_stage_config:
+       film_clap_cond1:
+         cond_stage_key: text
+         conditioning_key: film
+         target: src.modules.conditional.conditional_models.CLAPAudioEmbeddingClassifierFreev2
+         params:
+           pretrained_path: clap_htsat_tiny
+           sampling_rate: 16000
+           embed_mode: text # or audio
+           amodel: HTSAT-tiny
+       film_flan_t5_cond2:
+         cond_stage_key: text
+         conditioning_key: film
+         target: src.modules.conditional.conditional_models.FlanT5HiddenState
+         params:
+           text_encoder_name: google/flan-t5-large # google/flan-t5-xxl
+           freeze_text_encoder: True
+           return_embeds: True
+           pool_tokens: True
+
+       noncond_dataset_ids: # for a non-FIT backbone, use film_dataset_ids and enable encode_dataset_ids
+         cond_stage_key: all
+         conditioning_key: ignore
+         target: src.modules.conditional.conditional_models.DatasetIDs
+         params:
+           encode_dataset_ids: False
+           dataset2id:
+             audiocaps: 0
+             clotho: 1
+             vggsounds: 2
+             wavcaps_audioset_strong: 3
+             wavcaps_bbcsound: 4
+             wavcaps_freesound: 5
+             wavcaps_soundbible: 6
+             fsd50k: 7
+             caption_audioset: 8
+             unconditional: 0 # set the unconditional id to 0 for future experiments
+
+
+
+     evaluation_params:
+       unconditional_guidance_scale: 3.5
+       ddim_sampling_steps: 200
+       n_candidates_per_samples: 3
+
+
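A note for consumers of this file: it relies on YAML anchors and aliases (the &name definitions under variables: and the *name references elsewhere) plus a custom !module tag for dotted import paths, so a stock yaml.safe_load will reject it. A minimal loading sketch, assuming it is acceptable to resolve !module to its dotted-path string (the training code itself presumably imports the named object):

    import yaml

    # Assumption: for inspection purposes, the `!module` tag (used here for the
    # optimizer, initializer, and FIT layer classes) can be kept as a plain string.
    def _module_tag(loader, node):
        return loader.construct_scalar(node)

    yaml.SafeLoader.add_constructor("!module", _module_tag)

    with open("genau-full-l.yaml") as f:
        cfg = yaml.safe_load(f)

    # Aliases resolve at parse time, so values defined once under `variables:`
    # stay consistent everywhere they are referenced:
    assert cfg["preprocessing"]["audio"]["sampling_rate"] == cfg["variables"]["sampling_rate"]  # 16000
    print(cfg["model"]["params"]["optimizer_config"]["target"])  # src.modules.optimizers.lamb.Lamb

Because PyYAML expands *bs-style aliases on load, fields like batch_size and the two batchsize entries cannot drift apart without editing the single &bs definition.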