# This is the hyperparameter configuration file for FastSpeech2 v2.
# The difference between v2 and v1 is that v2 applies the Linformer technique.
# Please make sure this is adjusted for the Baker dataset. If you want to
# apply it to another dataset, you might need to carefully change some parameters.
# This configuration trains for 200k iterations, but the best checkpoint is
# around 150k iterations.
###########################################################
#               FEATURE EXTRACTION SETTING                #
###########################################################
# Hop size.
hop_size: 256
# NOTE(review): presumably the storage format of the extracted features;
# confirm against the consumer of this key.
format: "npy"
###########################################################
#              NETWORK ARCHITECTURE SETTING               #
###########################################################
model_type: "fastspeech2"

# Hyperparameters of the FastSpeech2 network. Every key in this section after
# `fastspeech2_params` is a child of that mapping and must stay indented
# beneath it.
fastspeech2_params:
  dataset: baker
  n_speakers: 1
  encoder_hidden_size: 256
  encoder_num_hidden_layers: 3
  encoder_num_attention_heads: 2
  encoder_attention_head_size: 16  # in v1, = 384//2
  encoder_intermediate_size: 1024
  encoder_intermediate_kernel_size: 3
  encoder_hidden_act: "mish"
  decoder_hidden_size: 256
  decoder_num_hidden_layers: 3
  decoder_num_attention_heads: 2
  decoder_attention_head_size: 16  # in v1, = 384//2
  decoder_intermediate_size: 1024
  decoder_intermediate_kernel_size: 3
  decoder_hidden_act: "mish"
  variant_prediction_num_conv_layers: 2
  variant_predictor_filter: 256
  variant_predictor_kernel_size: 3
  variant_predictor_dropout_rate: 0.5
  num_mels: 80
  hidden_dropout_prob: 0.2
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 2048
  initializer_range: 0.02
  output_attentions: false
  output_hidden_states: false
###########################################################
#                   DATA LOADER SETTING                   #
###########################################################
# Batch size for each GPU, assuming gradient_accumulation_steps == 1.
batch_size: 16
# Whether to remove samples whose length is less than batch_max_steps.
remove_short_samples: true
# Whether to allow caching in the dataset. If true, it requires CPU memory.
allow_cache: true
# Remove every target whose mel_length is <= 32.
mel_length_threshold: 32
# Shuffle the dataset after each epoch.
is_shuffle: true
###########################################################
#              OPTIMIZER & SCHEDULER SETTING              #
###########################################################
# NOTE(review): the nesting below was reconstructed because the original
# indentation was lost; confirm that exactly these five keys belong to
# optimizer_params.
optimizer_params:
  initial_learning_rate: 0.001
  end_learning_rate: 0.00005
  decay_steps: 150000  # A value below train_max_steps is recommended.
  warmup_proportion: 0.02
  weight_decay: 0.001

gradient_accumulation_steps: 1
# Expression selecting the trainable variables (e.g.
# 'embeddings|encoder|decoder'); alternatives must be separated by '|'.
# If var_train_expr is null, all variables are trained.
var_train_expr: null
###########################################################
#                    INTERVAL SETTING                     #
###########################################################
# Number of training steps.
train_max_steps: 200000
# Interval, in steps, between checkpoint saves.
save_interval_steps: 5000
# Interval, in steps, between evaluations of the network.
eval_interval_steps: 500
# Interval, in steps, between training-log records.
log_interval_steps: 200
# 2 steps use LR outputs only, then 1 step uses LR + F0 + Energy.
delay_f0_energy_steps: 3
###########################################################
#                      OTHER SETTING                      #
###########################################################
# Number of batches to be saved as intermediate results.
num_save_intermediate_results: 1