Upload genau-full-s.yaml with huggingface_hub
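The commit title indicates the file was pushed with `huggingface_hub`. A minimal sketch of that call, assuming a hypothetical `repo_id` and a local copy of the config (the target repository is not shown on this page):

```python
# Hypothetical reproduction of this commit; repo_id "user/genau" is an
# assumption, not the actual repository.
from huggingface_hub import HfApi

api = HfApi()  # picks up the token saved by `huggingface-cli login`
api.upload_file(
    path_or_fileobj="genau-full-s.yaml",  # local config file
    path_in_repo="genau-full-s.yaml",     # destination path inside the repo
    repo_id="user/genau",                 # hypothetical repo id
    commit_message="Upload genau-full-s.yaml with huggingface_hub",
)
```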
genau-full-s.yaml
ADDED
@@ -0,0 +1,346 @@
training:
  precision: "high"
  nodes_count: -1


logging:
  project_name: "audioldm-snap"
  wandb_key: 48955513a8a3387ed6a17f75021431035150e1fe
  log_directory: "./run_logs/genau/train"

  # Saving checkpoints
  # If an S3 path is specified, checkpoints are saved at S3_FOLDER/log_directory and deleted from the local folder (except the last checkpoint). Otherwise, checkpoints are saved locally indefinitely.
  S3_BUCKET: "snap-genvid"
  S3_FOLDER: 'mali6/audioldm'
  save_checkpoint_every_n_steps: 1500
  save_top_k: -1

variables:
  sampling_rate: &sampling_rate 16000
  mel_bins: &mel_bins 64
  latent_embed_dim: &latent_embed_dim 64
  latent_t_size: &latent_t_size 256 # TODO might need to change
  latent_f_size: &latent_f_size 1
  in_channels: &unet_in_channels 256
  optimize_ddpm_parameter: &optimize_ddpm_parameter true
  optimize_gpt: &optimize_gpt true
  warmup_steps: &warmup_steps 5000
  lr: &lr 5.0e-3
  mx_steps: &mx_steps 8000000
  batch_size: &bs 36 # TODO: change to 256
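  # The &name anchors above are referenced later in this file as *name aliases,
  # so each shared hyperparameter is defined exactly once.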


data:
  train: ['vggsounds', 'audiocaps', 'caption_audioset', 'wavcaps_audioset_strong', 'wavcaps_bbcsound', 'wavcaps_freesound', 'wavcaps_soundbible', 'clotho', 'fsd50k']
  val: "autocap"
  test: "autocap"
  class_label_indices: "audioset_eval_subset"
  dataloader_add_ons: []
  augment_p: 0.0
  num_workers: 48
  consistent_start_time: True

  keys_synonyms:
    gt_audio_caption:
      - audiocaps_gt_captions
      - gt_caption
      - gt_captions
      - caption
      - best_model_w_meta_pred_caption
      - gt_audio_caption
      - autocap_caption
      - wavcaps_caption
    tags:
      - keywords
      - tags


step:
  validation_every_n_epochs: 50
  save_checkpoint_every_n_steps: 2500
  # limit_val_batches: 4 # TODO: enable for test
  # limit_train_batches: 1 # TODO: enable for test
  max_steps: *mx_steps
  save_top_k: -1

preprocessing:
  video:
    fps: 1
    height: 224
    width: 224
  audio:
    sampling_rate: *sampling_rate
    max_wav_value: 32768.0
    duration: 10.24
  stft:
    filter_length: 1024
    hop_length: 160
    win_length: 1024
  mel:
    n_mel_channels: *mel_bins
    mel_fmin: 0
    mel_fmax: 8000
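    # Note (derived): 10.24 s at 16 kHz with hop_length 160 gives 10.24 * 16000 / 160 = 1024
    # mel frames, so latent_t_size 256 implies a 4x temporal compression in the autoencoder.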

augmentation:
  mixup: 0.0

model:
  target: audioldm_train.modules.latent_diffusion.ddpm.LatentDiffusion
  params:
    # dataset token
    dataset_embed_dim: 32

    # logging
    log_uncond: False
    validation_wo_ema: True
    num_val_sampled_timestamps: 10

    # # evaluation
    # evaluator:
    #   target: audioldm_eval.EvaluationHelper
    #   params:
    #     sampling_rate: 16000
    #     device: 'cuda'

    # Optimizer
    optimizer_config:
      # Which optimizer to use
      target: !module audioldm_train.modules.snapvideo.training.optimizers.lamb.Lamb
      # Which LR to use
      lr: *lr
      # The weight decay to use
      weight_decay: 0.01
      # Beta parameters for the optimizer
      betas: [0.9, 0.99]
      # Eps parameter for Adam
      eps: 0.00000001

    base_learning_rate: *lr
    # Final lr for cosine annealing
    final_lr: 0.0015 # Use cosine lr scheduling but do not reach 0, as performance degrades with a very small lr
    # Number of warmup steps
    warmup_steps: *warmup_steps
    # Number of steps between each lr update
    lr_update_each_steps: 10
    # Total number of training steps
    max_steps: *mx_steps # TODO enable

    # Autoencoder
    first_stage_config:
      base_learning_rate: 8.0e-06
      target: audioldm_train.modules.latent_encoder.autoencoder_1d.AutoencoderKL1D
      params:
        # reload_from_ckpt: "data/checkpoints/vae_mel_16k_64bins.ckpt"
        reload_from_ckpt: "/fsx/mali6/repos/AudioLDM2-training/log/vae_checkpoints/vae_64hdcheckpoint-344999.ckpt"
        sampling_rate: *sampling_rate
        batchsize: *bs # TODO: change
        monitor: val/rec_loss
        image_key: fbank
        subband: 1
        embed_dim: *latent_embed_dim
        time_shuffle: 1
        lossconfig:
          target: audioldm_train.losses.LPIPSWithDiscriminator
          params:
            disc_start: 50001
            kl_weight: 1000.0
            disc_weight: 0.5
            disc_in_channels: 1
        ddconfig:
          double_z: true
          mel_bins: *mel_bins # The frequency bins of the mel spectrogram
          z_channels: *unet_in_channels
          resolution: 256
          downsample_time: false
          in_channels: 64
          out_ch: 64 # in and out channels must stay at 64
          ch: 512
          ch_mult:
            - 1
            - 2
            - 4
          num_res_blocks: 3
          attn_resolutions: []
          dropout: 0.0

    # Other parameters
    clip_grad: 0.5
    optimize_ddpm_parameter: *optimize_ddpm_parameter
    sampling_rate: *sampling_rate
    batchsize: *bs
    linear_start: 0.0015 # In DDPM, a linear schedule is used from 1e-4 to 0.02. LDM uses a linear schedule with the same parameters. Make-An-Audio uses different start and end values. Improved DDPM introduced a cosine schedule, and RIN a sigmoid one.
    linear_end: 0.0195
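    # linear_start/linear_end are the endpoints of the diffusion beta schedule over the
    # 1000 timesteps configured below.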
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    unconditional_prob_cfg: 0.1
    parameterization: eps # [eps, x0, v]
    first_stage_key: fbank
    latent_t_size: *latent_t_size # TODO might need to change
    latent_f_size: *latent_f_size
    channels: *latent_embed_dim # TODO might need to change
    monitor: val/loss_simple_ema

    scale_by_std: True
    # scale_factor: 1.0144787


    backbone_type: fit
    unet_config:
      target: audioldm_train.modules.snapvideo.models.vision.backbones.fit_audio.FIT

      params:
        weight_initializer:
          target: !module audioldm_train.modules.snapvideo.models.initializers.rin_weight_scaler_initializer.RINWeightScalerInitializer
          scale: 0.57735026919 # 1/sqrt(3) from Yuwei's findings

        fit_block_module: !module audioldm_train.modules.snapvideo.models.vision.layers.fit_block_v5.FITBlockV5
        context_channels: 1024
        summary_text_embeddings_channels: 1536 # text embedding (e.g. CLAP) size
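        # Note (derived): 1536 presumably = 512 (CLAP embedding) + 1024 (FLAN-T5-large hidden
        # size), matching the two FiLM text conditioners under cond_stage_config below.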

        # If True, inserts the conditioning information in the context
        conditioning_in_context: True

        # The type of positional encodings to use for the time input
        time_pe_type: learned
        # Uses a label that specifies the dataset of the current input
        use_dataset_id_conditioning: True
        # Uses a label that specifies the resolution of the current input
        use_resolution_conditioning: False

        # Size of the input in pixels
        input_size: [1, *latent_t_size, *latent_f_size] # (frames_count, height, width)
        # The size in pixels of each patch
        patch_size: [1, 1, 1]
        # The number of patches in each group
        group_size: [1, 32, 1]
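        # Note (derived): with input_size [1, 256, 1] and patch_size [1, 1, 1] there are 256
        # patches; group_size 32 along the time axis splits them into 256 / 32 = 8 groups.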
        input_channels: *latent_embed_dim
        # The number of channels in the patch embeddings
        patch_channels: 1024
        # The number of FIT blocks
        fit_blocks_count: 4
        # The number of local layers in each FIT block
        local_layers_per_block: 2
        # The number of global layers in each FIT block
        global_layers_per_block: 4
        # The number of latent tokens
        latent_count: 256
        # The number of channels in the latent tokens
        latent_channels: 1024

        self_conditioning_ff_config: {}
        fit_block_config:
          attention_class: !module audioldm_train.modules.snapvideo.models.vision.layers.rin_layers.Attention
          ff_class: !module audioldm_train.modules.snapvideo.models.vision.layers.rin_layers.FeedForward

          # Dropout parameters
          drop_units: 0.1
          drop_path: 0.0

          # Whether to use feedforward layers after cross attention
          use_cross_attention_feedforward: True

          # Configuration for attention layers
          default_attention_config:
            heads: 8
            dim_head: 128
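            # 8 heads * 128 dim_head = 1024, matching patch_channels, latent_channels, and
            # context_channels above.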
          read_attention_config:
            # Ensure heads * dim_head = min(input_channels, patch_channels)
            heads: 8
            dim_head: 128
          read_context_attention_config:
            # Ensure heads * dim_head = min(latent_channels, context_channels)
            heads: 8
            dim_head: 128
          read_latent_conditioning_attention_config:
            # Ensure heads * dim_head = latent_channels
            heads: 8
            dim_head: 128
          write_attention_config:
            # Ensure heads * dim_head = min(input_channels, patch_channels)
            heads: 8
            dim_head: 128
          local_attention_config:
            # Ensure heads * dim_head = patch_channels
            heads: 8
            dim_head: 128
          global_attention_config:
            # Ensure heads * dim_head = latent_channels
            heads: 8
            dim_head: 128

          ff_config: {}
    # unet_config:
    #   target: audioldm_train.modules.diffusionmodules.openaimodel.UNetModel
    #   params:
    #     image_size: 64
    #     extra_film_condition_dim: 512 # If you use FiLM as an extra condition, set this parameter. For example, if you have two conditioning vectors of dimension 512 each, this number would be 1024
    #     # context_dim:
    #     # - 768
    #     in_channels: *unet_in_channels # The input channels of the UNet model
    #     out_channels: *latent_embed_dim # TODO might need to change
    #     model_channels: 128 # TODO might need to change
    #     attention_resolutions:
    #       - 8
    #       - 4
    #       - 2
    #     num_res_blocks: 2
    #     channel_mult:
    #       - 1
    #       - 2
    #       - 3
    #       - 5
    #     num_head_channels: 32
    #     use_spatial_transformer: true
    #     transformer_depth: 1
    #     extra_sa_layer: false

    cond_stage_config:
      film_clap_cond1:
        cond_stage_key: text
        conditioning_key: film
        target: audioldm_train.conditional_models.CLAPAudioEmbeddingClassifierFreev2
        params:
          pretrained_path: data/checkpoints/clap_htsat_tiny.pt
          sampling_rate: 16000
          embed_mode: text # or audio
          amodel: HTSAT-tiny
      film_flan_t5_cond2:
        cond_stage_key: text
        conditioning_key: film
        target: audioldm_train.conditional_models.FlanT5HiddenState
        params:
          text_encoder_name: google/flan-t5-large # google/flan-t5-xxl
          freeze_text_encoder: True
          return_embeds: True
          pool_tokens: True

      noncond_dataset_ids: # for a non-FIT backbone, use film_dataset_ids and enable encode_dataset_ids
        cond_stage_key: all
        conditioning_key: ignore
        target: src.modules.conditional.conditional_models.DatasetIDs
        params:
          encode_dataset_ids: False
          dataset2id:
            audiocaps: 0
            clotho: 1
            vggsounds: 2
            wavcaps_audioset_strong: 3
            wavcaps_bbcsound: 4
            wavcaps_freesound: 5
            wavcaps_soundbible: 6
            fsd50k: 7
            caption_audioset: 8
            autocap: 9
            unconditional: 0 # set the unconditional id to 0 for future experiments



evaluation_params:
  unconditional_guidance_scale: 3.5
  ddim_sampling_steps: 200
  n_candidates_per_samples: 3
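The config leans on two YAML features: anchors/aliases (`&name` / `*name`) for shared hyperparameters, and a custom `!module` tag for dotted import paths. A minimal loading sketch with PyYAML, assuming `!module` resolves its scalar to a Python object and that the referenced project modules are importable; the project's actual constructor may differ:

```python
# Sketch of loading this config with PyYAML. The `!module` constructor is an
# assumption about the tag's semantics, not the project's implementation.
import importlib

import yaml


def module_constructor(loader, node):
    # Resolve a dotted path such as "pkg.mod.ClassName" to the Python object.
    dotted = loader.construct_scalar(node)
    module_path, attr = dotted.rsplit(".", 1)
    return getattr(importlib.import_module(module_path), attr)


yaml.SafeLoader.add_constructor("!module", module_constructor)

with open("genau-full-s.yaml") as f:
    config = yaml.safe_load(f)

# Aliases are resolved at parse time: the batch size anchored under
# `variables` reappears verbatim inside the nested model config.
assert config["variables"]["batch_size"] == config["model"]["params"]["batchsize"]
```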