|
from __gin__ import dynamic_registration |
|
import cached_conv as cc |
|
from cached_conv import convs |
|
import rave |
|
from rave import blocks |
|
from rave import core |
|
from rave import dataset |
|
from rave import descript_discriminator |
|
from rave import discriminator |
|
from rave import model |
|
from rave import pqmf |
|
import torch |
|
import torch.nn as nn |
|
|
|
# Macros: |
|
# ============================================================================== |
|
# Shared values. Bindings further down reference them as %NAME, which keeps the
# encoder, decoder and model settings in agreement.
# ACTIVATION is written without "()", so the Snake configurable itself is passed.
ACTIVATION = @blocks.Snake |
|
CAPACITY = 96 |
|
# Four dilation lists for the four RATIOS entries.
# NOTE(review): presumably one list per resampling stage - confirm in rave.blocks.
DILATIONS = [[1, 3, 9], [1, 3, 9], [1, 3, 9], [1, 3]] |
|
KERNEL_SIZE = 3 |
|
LATENT_SIZE = 128 |
|
# Bound to pqmf.CachedPQMF.n_band and to data_size of the encoder and decoder.
N_BAND = 16 |
|
# Passed, together with LATENT_SIZE, to core.get_augmented_latent_size.
NOISE_AUGMENTATION = 0 |
|
PHASE_1_DURATION = 0 |
|
RATIOS = [4, 4, 4, 2] |
|
# Bound to model.RAVE.sampling_rate and core.MultiScaleSTFT.sample_rate.
SAMPLING_RATE = 48000 |
|
|
|
# Parameters for blocks.AdaptiveInstanceNormalization: |
|
# ============================================================================== |
|
# None. |
|
|
|
# Parameters for variational/blocks.AdaptiveInstanceNormalization: |
|
# ============================================================================== |
|
# None. |
|
|
|
# Parameters for core.AudioDistanceV1: |
|
# ============================================================================== |
|
# This configurable is bound to both model.RAVE.audio_distance and
# model.RAVE.multiband_audio_distance.
core.AudioDistanceV1.log_epsilon = 1e-07 |
|
# No "()" after the reference: the MultiScaleSTFT configurable is passed as-is
# rather than being called by gin at binding time.
core.AudioDistanceV1.multiscale_stft = @core.MultiScaleSTFT |
|
|
|
# Parameters for model.BetaWarmupCallback: |
|
# ============================================================================== |
|
# NOTE(review): presumably the callback moves a value from initial_value to
# target_value over warmup_len steps - confirm against rave.model.
model.BetaWarmupCallback.initial_value = 1e-06 |
|
model.BetaWarmupCallback.target_value = 0.005 |
|
model.BetaWarmupCallback.warmup_len = 20000 |
|
|
|
# Parameters for pqmf.CachedPQMF: |
|
# ============================================================================== |
|
# This configurable is bound to model.RAVE.pqmf.
pqmf.CachedPQMF.attenuation = 100 |
|
# Follows the N_BAND macro, which is also the data_size of encoder and decoder.
pqmf.CachedPQMF.n_band = %N_BAND |
|
|
|
# Parameters for cc.Conv1d: |
|
# ============================================================================== |
|
# Unscoped binding: sets bias=False for cc.Conv1d wherever no gin scope is active.
cc.Conv1d.bias = False |
|
|
|
# Parameters for variational/cc.Conv1d: |
|
# ============================================================================== |
|
# Same value as the unscoped cc.Conv1d binding, recorded for the "variational"
# scope (the scope named in blocks.VariationalEncoder.encoder).
variational/cc.Conv1d.bias = False |
|
|
|
# Parameters for cc.ConvTranspose1d: |
|
# ============================================================================== |
|
# Unscoped binding: sets bias=False for cc.ConvTranspose1d.
cc.ConvTranspose1d.bias = False |
|
|
|
# Parameters for descript_discriminator.DescriptDiscriminator: |
|
# ============================================================================== |
|
# This configurable is bound to model.RAVE.discriminator.
descript_discriminator.DescriptDiscriminator.bands = \ |
|
[(0.0, 0.1), (0.1, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)] |
|
descript_discriminator.DescriptDiscriminator.fft_sizes = [2048, 1024, 512] |
|
descript_discriminator.DescriptDiscriminator.periods = [2, 3, 5, 7, 11] |
|
descript_discriminator.DescriptDiscriminator.rates = [] |
|
# Follows the SAMPLING_RATE macro, like model.RAVE.sampling_rate and
# core.MultiScaleSTFT.sample_rate, instead of a separate literal (this was
# 44100 while SAMPLING_RATE is 48000).
descript_discriminator.DescriptDiscriminator.sample_rate = %SAMPLING_RATE |
|
|
|
# Parameters for variational/blocks.EncoderV2: |
|
# ============================================================================== |
|
# EncoderV2 settings that apply inside the "variational" scope; this scoped
# encoder is what blocks.VariationalEncoder.encoder references.
variational/blocks.EncoderV2.activation = %ACTIVATION |
|
# Reference without "()": the configurable itself is passed, not an instance.
variational/blocks.EncoderV2.adain = @blocks.AdaptiveInstanceNormalization |
|
variational/blocks.EncoderV2.capacity = %CAPACITY |
|
variational/blocks.EncoderV2.data_size = %N_BAND |
|
variational/blocks.EncoderV2.dilations = %DILATIONS |
|
variational/blocks.EncoderV2.keep_dim = False |
|
variational/blocks.EncoderV2.kernel_size = %KERNEL_SIZE |
|
variational/blocks.EncoderV2.latent_size = %LATENT_SIZE |
|
# NOTE(review): presumably two outputs per latent dimension for the variational
# encoder (e.g. mean and scale) - confirm in rave.blocks.
variational/blocks.EncoderV2.n_out = 2 |
|
variational/blocks.EncoderV2.ratios = %RATIOS |
|
variational/blocks.EncoderV2.recurrent_layer = None |
|
variational/blocks.EncoderV2.spectrogram = None |
|
|
|
# Parameters for blocks.GeneratorV2: |
|
# ============================================================================== |
|
# This configurable is bound to model.RAVE.decoder. activation, capacity,
# data_size, dilations, kernel_size and ratios use the same macros as the
# "variational" EncoderV2 bindings.
blocks.GeneratorV2.activation = %ACTIVATION |
|
# Reference without "()": the configurable itself is passed, not an instance.
blocks.GeneratorV2.adain = @blocks.AdaptiveInstanceNormalization |
|
blocks.GeneratorV2.amplitude_modulation = True |
|
blocks.GeneratorV2.capacity = %CAPACITY |
|
blocks.GeneratorV2.causal_convtranspose = False |
|
blocks.GeneratorV2.data_size = %N_BAND |
|
blocks.GeneratorV2.dilations = %DILATIONS |
|
blocks.GeneratorV2.keep_dim = False |
|
blocks.GeneratorV2.kernel_size = %KERNEL_SIZE |
|
# The trailing "()" makes gin call core.get_augmented_latent_size and bind its
# return value, so the decoder's latent size is computed rather than literal.
blocks.GeneratorV2.latent_size = @core.get_augmented_latent_size() |
|
blocks.GeneratorV2.noise_module = None |
|
blocks.GeneratorV2.ratios = %RATIOS |
|
blocks.GeneratorV2.recurrent_layer = None |
|
|
|
# Parameters for core.get_augmented_latent_size: |
|
# ============================================================================== |
|
# Arguments for the call made in blocks.GeneratorV2.latent_size.
# NOTE(review): with NOISE_AUGMENTATION = 0 the result is presumably LATENT_SIZE
# unchanged - confirm in rave.core.
core.get_augmented_latent_size.latent_size = %LATENT_SIZE |
|
core.get_augmented_latent_size.noise_augmentation = %NOISE_AUGMENTATION |
|
|
|
# Parameters for convs.get_padding: |
|
# ============================================================================== |
|
# Unscoped values for convs.get_padding (from cached_conv).
convs.get_padding.dilation = 1 |
|
convs.get_padding.mode = 'causal' |
|
convs.get_padding.stride = 1 |
|
|
|
# Parameters for variational/convs.get_padding: |
|
# ============================================================================== |
|
# Same three values as the unscoped convs.get_padding bindings, recorded for
# the "variational" scope.
variational/convs.get_padding.dilation = 1 |
|
variational/convs.get_padding.mode = 'causal' |
|
variational/convs.get_padding.stride = 1 |
|
|
|
# Parameters for core.MultiScaleSTFT: |
|
# ============================================================================== |
|
# This configurable is what core.AudioDistanceV1.multiscale_stft references.
core.MultiScaleSTFT.magnitude = True |
|
core.MultiScaleSTFT.normalized = False |
|
# NOTE(review): None presumably means no mel projection is applied - confirm
# in rave.core.
core.MultiScaleSTFT.num_mels = None |
|
core.MultiScaleSTFT.random_crop = True |
|
# Follows the SAMPLING_RATE macro, as model.RAVE.sampling_rate does.
core.MultiScaleSTFT.sample_rate = %SAMPLING_RATE |
|
core.MultiScaleSTFT.scales = [2048, 1024, 512, 256, 128] |
|
|
|
# Parameters for blocks.normalization: |
|
# ============================================================================== |
|
# Unscoped binding: selects the 'weight_norm' mode of blocks.normalization.
blocks.normalization.mode = 'weight_norm' |
|
|
|
# Parameters for variational/blocks.normalization: |
|
# ============================================================================== |
|
# Same mode as the unscoped blocks.normalization binding, recorded for the
# "variational" scope.
variational/blocks.normalization.mode = 'weight_norm' |
|
|
|
# Parameters for model.RAVE: |
|
# ============================================================================== |
|
# Top-level wiring of the model. Every "@..." value below is written without
# "()", so model.RAVE receives the configurables themselves rather than
# objects built by gin at binding time.
model.RAVE.audio_distance = @core.AudioDistanceV1 |
|
model.RAVE.decoder = @blocks.GeneratorV2 |
|
model.RAVE.discriminator = @descript_discriminator.DescriptDiscriminator |
|
model.RAVE.enable_pqmf_decode = True |
|
model.RAVE.enable_pqmf_encode = True |
|
model.RAVE.encoder = @blocks.VariationalEncoder |
|
# core.mean_difference is referenced under the "feature_matching" scope; no
# bindings for that scope appear in this listing.
model.RAVE.feature_matching_fun = @feature_matching/core.mean_difference |
|
model.RAVE.freeze_encoder = False |
|
model.RAVE.gan_loss = @core.hinge_gan |
|
model.RAVE.latent_size = %LATENT_SIZE |
|
# Same configurable as audio_distance above.
model.RAVE.multiband_audio_distance = @core.AudioDistanceV1 |
|
model.RAVE.num_skipped_features = 1 |
|
model.RAVE.phase_1_duration = %PHASE_1_DURATION |
|
model.RAVE.pqmf = @pqmf.CachedPQMF |
|
model.RAVE.sampling_rate = %SAMPLING_RATE |
|
model.RAVE.update_discriminator_every = 4 |
|
model.RAVE.valid_signal_crop = True |
|
model.RAVE.warmup_quantize = None |
|
# Only the 'feature_matching' weight is given here.
# NOTE(review): other loss weights presumably fall back to defaults inside
# model.RAVE - confirm in rave.model.
model.RAVE.weights = {'feature_matching': 20} |
|
|
|
# Parameters for blocks.Snake: |
|
# ============================================================================== |
|
# None. |
|
|
|
# Parameters for variational/blocks.Snake: |
|
# ============================================================================== |
|
# None. |
|
|
|
# Parameters for dataset.split_dataset: |
|
# ============================================================================== |
|
# NOTE(review): max_residual presumably caps the size of the held-out split -
# confirm in rave.dataset.
dataset.split_dataset.max_residual = 1000 |
|
|
|
# Parameters for blocks.VariationalEncoder: |
|
# ============================================================================== |
|
# The "variational/" prefix is a gin scope: the variational/... bindings in
# this file are the ones that apply when EncoderV2 is called through this
# reference.
blocks.VariationalEncoder.encoder = @variational/blocks.EncoderV2 |
|
|