{
  "model_type": "autoencoder",
  "sample_size": 12000,
  "sample_rate": 24000,
  "audio_channels": 1,
  "model": {
    "encoder": {
      "type": "oobleck",
      "config": {
        "in_channels": 1,
        "channels": 128,
        "c_mults": [1, 2, 4, 8],
        "strides": [2, 4, 6, 10],
        "latent_dim": 256,
        "use_snake": true
      }
    },
    "decoder": {
      "type": "oobleck",
      "config": {
        "out_channels": 1,
        "channels": 128,
        "c_mults": [1, 2, 4, 8],
        "strides": [2, 4, 6, 10],
        "latent_dim": 128,
        "use_snake": true,
        "final_tanh": false
      }
    },
    "bottleneck": {
      "type": "vae"
    },
    "latent_dim": 128,
    "downsampling_ratio": 480,
    "io_channels": 1
  },
  "training": {
    "learning_rate": 1.5e-4,
    "warmup_steps": 0,
    "use_ema": false,
    "optimizer_configs": {
      "autoencoder": {
        "optimizer": {
          "type": "AdamW",
          "config": {
            "betas": [0.8, 0.99],
            "lr": 1.5e-4,
            "weight_decay": 1e-3
          }
        },
        "scheduler": {
          "type": "InverseLR",
          "config": {
            "inv_gamma": 200000,
            "power": 0.5,
            "warmup": 0.999
          }
        }
      },
      "discriminator": {
        "optimizer": {
          "type": "AdamW",
          "config": {
            "betas": [0.8, 0.99],
            "lr": 3e-4,
            "weight_decay": 1e-3
          }
        },
        "scheduler": {
          "type": "InverseLR",
          "config": {
            "inv_gamma": 200000,
            "power": 0.5,
            "warmup": 0.999
          }
        }
      }
    },
    "loss_configs": {
      "discriminator": {
        "type": "encodec",
        "config": {
          "filters": 64,
          "n_ffts": [1280, 640, 320, 160, 80],
          "hop_lengths": [320, 160, 80, 40, 20],
          "win_lengths": [1280, 640, 320, 160, 80]
        },
        "weights": {
          "adversarial": 0.1,
          "feature_matching": 5.0
        }
      },
      "spectral": {
        "type": "mrstft",
        "config": {
          "fft_sizes": [1280, 640, 320, 160, 80, 40, 20],
          "hop_sizes": [320, 160, 80, 40, 20, 10, 5],
          "win_lengths": [1280, 640, 320, 160, 80, 40, 20],
          "perceptual_weighting": true
        },
        "weights": {
          "mrstft": 1.0
        }
      },
      "time": {
        "type": "l1",
        "weights": {
          "l1": 0.0
        }
      },
      "bottleneck": {
        "type": "kl",
        "weights": {
          "kl": 1e-4
        }
      }
    },
    "demo": {
      "demo_every": 10000
    }
  }
}