mali6 committed
Commit 720b7a8 · verified · 1 Parent(s): 5c17464

Upload genau-full-s.yaml with huggingface_hub

Files changed (1): genau-full-s.yaml (+346, -0)
genau-full-s.yaml ADDED
@@ -0,0 +1,346 @@

training:
  precision: "high"
  nodes_count: -1


logging:
  project_name: "audioldm-snap"
  wandb_key: 48955513a8a3387ed6a17f75021431035150e1fe
  log_directory: "./run_logs/genau/train"

  # Saving checkpoints
  # If an S3 path is specified, checkpoints are saved at S3_FOLDER/log_directory and deleted from the local folder (except the last checkpoint). Otherwise, checkpoints are saved locally indefinitely.
  S3_BUCKET: "snap-genvid"
  S3_FOLDER: 'mali6/audioldm'
  save_checkpoint_every_n_steps: 1500
  save_top_k: -1

variables:
  sampling_rate: &sampling_rate 16000
  mel_bins: &mel_bins 64
  latent_embed_dim: &latent_embed_dim 64
  latent_t_size: &latent_t_size 256 # TODO might need to change
  latent_f_size: &latent_f_size 1
  in_channels: &unet_in_channels 256
  optimize_ddpm_parameter: &optimize_ddpm_parameter true
  optimize_gpt: &optimize_gpt true
  warmup_steps: &warmup_steps 5000
  lr: &lr 5.0e-3
  mx_steps: &mx_steps 8000000
  batch_size: &bs 36 # TODO: change to 256


data:
  train: ['vggsounds', 'audiocaps', 'caption_audioset', 'wavcaps_audioset_strong', 'wavcaps_bbcsound', 'wavcaps_freesound', 'wavcaps_soundbible', 'clotho', 'fsd50k']
  val: "autocap"
  test: "autocap"
  class_label_indices: "audioset_eval_subset"
  dataloader_add_ons: []
  augment_p: 0.0
  num_workers: 48
  consistent_start_time: True

  keys_synonyms:
    gt_audio_caption:
      - audiocaps_gt_captions
      - gt_caption
      - gt_captions
      - caption
      - best_model_w_meta_pred_caption
      - gt_audio_caption
      - autocap_caption
      - wavcaps_caption
    tags:
      - keywords
      - tags


step:
  validation_every_n_epochs: 50
  save_checkpoint_every_n_steps: 2500
  # limit_val_batches: 4 # TODO: enable for test
  # limit_train_batches: 1 # TODO: enable for test
  max_steps: *mx_steps
  save_top_k: -1

preprocessing:
  video:
    fps: 1
    height: 224
    width: 224
  audio:
    sampling_rate: *sampling_rate
    max_wav_value: 32768.0
    duration: 10.24
  stft:
    filter_length: 1024
    hop_length: 160
    win_length: 1024
  mel:
    n_mel_channels: *mel_bins
    mel_fmin: 0
    mel_fmax: 8000

augmentation:
  mixup: 0.0

model:
  target: audioldm_train.modules.latent_diffusion.ddpm.LatentDiffusion
  params:
    # dataset token
    dataset_embed_dim: 32

    # logging
    log_uncond: False
    validation_wo_ema: True
    num_val_sampled_timestamps: 10

    # # evaluation
    # evaluator:
    #   target: audioldm_eval.EvaluationHelper
    #   params:
    #     sampling_rate: 16000
    #     device: 'cuda'

    # Optimizer
    optimizer_config:
      # Which optimizer to use
      target: !module audioldm_train.modules.snapvideo.training.optimizers.lamb.Lamb
      # Which LR to use
      lr: *lr
      # The weight decay to use
      weight_decay: 0.01
      # Beta parameters for the optimizer
      betas: [0.9, 0.99]
      # Eps parameter for Adam
      eps: 0.00000001

    base_learning_rate: *lr
    # Final lr for cosine annealing
    final_lr: 0.0015 # Use cosine lr scheduling but do not reach 0, as performance degrades with a very small lr
    # Number of warmup steps
    warmup_steps: *warmup_steps
    # Number of steps between each lr update
    lr_update_each_steps: 10
    # Total number of training steps
    max_steps: *mx_steps # TODO enable

    # Autoencoder
    first_stage_config:
      base_learning_rate: 8.0e-06
      target: audioldm_train.modules.latent_encoder.autoencoder_1d.AutoencoderKL1D
      params:
        # reload_from_ckpt: "data/checkpoints/vae_mel_16k_64bins.ckpt"
        reload_from_ckpt: "/fsx/mali6/repos/AudioLDM2-training/log/vae_checkpoints/vae_64hdcheckpoint-344999.ckpt"
        sampling_rate: *sampling_rate
        batchsize: *bs # TODO: change
        monitor: val/rec_loss
        image_key: fbank
        subband: 1
        embed_dim: *latent_embed_dim
        time_shuffle: 1
        lossconfig:
          target: audioldm_train.losses.LPIPSWithDiscriminator
          params:
            disc_start: 50001
            kl_weight: 1000.0
            disc_weight: 0.5
            disc_in_channels: 1
        ddconfig:
          double_z: true
          mel_bins: *mel_bins # The frequency bins of the mel spectrogram
          z_channels: *unet_in_channels
          resolution: 256
          downsample_time: false
          in_channels: 64
          out_ch: 64 # in and out channels must stay at 64
          ch: 512
          ch_mult:
            - 1
            - 2
            - 4
          num_res_blocks: 3
          attn_resolutions: []
          dropout: 0.0

    # Other parameters
    clip_grad: 0.5
    optimize_ddpm_parameter: *optimize_ddpm_parameter
    sampling_rate: *sampling_rate
    batchsize: *bs
    linear_start: 0.0015 # In DDPM, a linear schedule is used from 1e-4 to 0.02. LDM uses a linear schedule with the same parameters. Make-An-Audio uses different start and end values. Improved DDPM introduced a cosine schedule and RIN a sigmoid one.
    linear_end: 0.0195
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    unconditional_prob_cfg: 0.1
    parameterization: eps # [eps, x0, v]
    first_stage_key: fbank
    latent_t_size: *latent_t_size # TODO might need to change
    latent_f_size: *latent_f_size
    channels: *latent_embed_dim # TODO might need to change
    monitor: val/loss_simple_ema

    scale_by_std: True
    # scale_factor: 1.0144787


    backbone_type: fit
    unet_config:
      target: audioldm_train.modules.snapvideo.models.vision.backbones.fit_audio.FIT

      params:
        weight_initializer:
          target: !module audioldm_train.modules.snapvideo.models.initializers.rin_weight_scaler_initializer.RINWeightScalerInitializer
          scale: 0.57735026919 # 1/sqrt(3) from Yuwei's findings

        fit_block_module: !module audioldm_train.modules.snapvideo.models.vision.layers.fit_block_v5.FITBlockV5
        context_channels: 1024
        summary_text_embeddings_channels: 1536 # text embedding (e.g. CLAP) size

        # If True, inserts the conditioning information in the context
        conditioning_in_context: True

        # The type of positional encodings to use for the time input
        time_pe_type: learned
        # Uses a label that specifies the dataset of the current input
        use_dataset_id_conditioning: True
        # Uses a label that specifies the resolution of the current input
        use_resolution_conditioning: False

        # Size of the input in pixels
        input_size: [1, *latent_t_size, *latent_f_size] # (frames_count, height, width)
        # The size in pixels of each patch
        patch_size: [1, 1, 1]
        # The number of patches in each group
        group_size: [1, 32, 1]
        input_channels: *latent_embed_dim
        # The number of channels in the patch embeddings
        patch_channels: 1024
        # The number of fit blocks
        fit_blocks_count: 4
        # The number of local layers in each fit block
        local_layers_per_block: 2
        # The number of global layers in each fit block
        global_layers_per_block: 4
        # The number of latent tokens
        latent_count: 256
        # The number of channels in the latent tokens
        latent_channels: 1024

        self_conditioning_ff_config: {}
        fit_block_config:
          attention_class: !module audioldm_train.modules.snapvideo.models.vision.layers.rin_layers.Attention
          ff_class: !module audioldm_train.modules.snapvideo.models.vision.layers.rin_layers.FeedForward

          # Dropout parameters
          drop_units: 0.1
          drop_path: 0.0

          # Whether to use feedforward layers after cross attention
          use_cross_attention_feedforward: True

          # Configuration for attention layers
          default_attention_config:
            heads: 8
            dim_head: 128
          read_attention_config:
            # Ensure heads * dim_head = min(input_channels, patch_channels)
            heads: 8
            dim_head: 128
          read_context_attention_config:
            # Ensure heads * dim_head = min(latent_channels, context_channels)
            heads: 8
            dim_head: 128
          read_latent_conditioning_attention_config:
            # Ensure heads * dim_head = latent_channels
            heads: 8
            dim_head: 128
          write_attention_config:
            # Ensure heads * dim_head = min(input_channels, patch_channels)
            heads: 8
            dim_head: 128
          local_attention_config:
            # Ensure heads * dim_head = patch_channels
            heads: 8
            dim_head: 128
          global_attention_config:
            # Ensure heads * dim_head = latent_channels
            heads: 8
            dim_head: 128

          ff_config: {}
    # unet_config:
    #   target: audioldm_train.modules.diffusionmodules.openaimodel.UNetModel
    #   params:
    #     image_size: 64
    #     extra_film_condition_dim: 512 # If you use FiLM as an extra condition, set this parameter. For example, if you have two conditioning vectors each of dimension 512, this number would be 1024
    #     # context_dim:
    #     # - 768
    #     in_channels: *unet_in_channels # The input channel of the UNet model
    #     out_channels: *latent_embed_dim # TODO might need to change
    #     model_channels: 128 # TODO might need to change
    #     attention_resolutions:
    #       - 8
    #       - 4
    #       - 2
    #     num_res_blocks: 2
    #     channel_mult:
    #       - 1
    #       - 2
    #       - 3
    #       - 5
    #     num_head_channels: 32
    #     use_spatial_transformer: true
    #     transformer_depth: 1
    #     extra_sa_layer: false

    cond_stage_config:
      film_clap_cond1:
        cond_stage_key: text
        conditioning_key: film
        target: audioldm_train.conditional_models.CLAPAudioEmbeddingClassifierFreev2
        params:
          pretrained_path: data/checkpoints/clap_htsat_tiny.pt
          sampling_rate: 16000
          embed_mode: text # or audio
          amodel: HTSAT-tiny
      film_flan_t5_cond2:
        cond_stage_key: text
        conditioning_key: film
        target: audioldm_train.conditional_models.FlanT5HiddenState
        params:
          text_encoder_name: google/flan-t5-large # google/flan-t5-xxl
          freeze_text_encoder: True
          return_embeds: True
          pool_tokens: True

      noncond_dataset_ids: # for a non-FIT backbone, use film_dataset_ids and enable encode_dataset_ids
        cond_stage_key: all
        conditioning_key: ignore
        target: src.modules.conditional.conditional_models.DatasetIDs
        params:
          encode_dataset_ids: False
          dataset2id:
            audiocaps: 0
            clotho: 1
            vggsounds: 2
            wavcaps_audioset_strong: 3
            wavcaps_bbcsound: 4
            wavcaps_freesound: 5
            wavcaps_soundbible: 6
            fsd50k: 7
            caption_audioset: 8
            autocap: 9
            unconditional: 0 # set the unconditional to 0 for future experiments


    evaluation_params:
      unconditional_guidance_scale: 3.5
      ddim_sampling_steps: 200
      n_candidates_per_samples: 3
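
For reference, the uploaded file leans on two YAML features: standard anchors/aliases (e.g. &lr / *lr, &bs / *bs) that keep the variables block and the model parameters in sync, and a custom !module tag that points at Python classes by dotted path (e.g. the Lamb optimizer and the FIT backbone). The sketch below shows one way such a file could be parsed with PyYAML; the constructor registered for !module is an illustrative assumption, since the repository's actual config loader is not part of this commit.

# Minimal sketch (assumptions: PyYAML is available; the real GenAU loader may
# resolve `!module` differently, e.g. by importing the referenced class).
import yaml

def module_constructor(loader, node):
    # Keep the dotted path (e.g. "...optimizers.lamb.Lamb") as a plain string here;
    # the actual trainer presumably imports the referenced object before use.
    return loader.construct_scalar(node)

# Register the custom tag so safe_load can parse the file.
yaml.SafeLoader.add_constructor("!module", module_constructor)

with open("genau-full-s.yaml") as f:
    cfg = yaml.safe_load(f)

# Anchors/aliases are resolved at parse time, so aliased fields already hold
# concrete values: *lr -> 0.005, *bs -> 36, *mx_steps -> 8000000, etc.
assert cfg["model"]["params"]["optimizer_config"]["lr"] == cfg["variables"]["lr"]
print(cfg["variables"]["lr"])   # 0.005
print(cfg["data"]["train"])     # list of training datasets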