# Hyperparameters for fine-tuning a HuBERT encoder with a CTC output layer for ASR.
# (Viewer metadata kept for reference: file size 5,394 bytes, revision b128f21.)
# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1986
# `!apply` calls torch.manual_seed(<seed>) while this file is being loaded,
# which is why this entry has to stay above every `!new:` object.
__set_seed: !apply:torch.manual_seed [!ref <seed>]
# Language name; reused below as a sub-folder of the results and data paths.
lang_csv: Swahili
# Experiment outputs are grouped per seed and per language.
output_folder: !ref results/finetune_hubert_ASR_char/<seed>/<lang_csv>
# NOTE(review): presumably where the error-rate reports are written --
# confirm in the training script.
output_wer_folder: !ref <output_folder>/
# Root directory handed to the checkpointer (checkpoints_dir).
save_folder: !ref <output_folder>/save
# File handed to the training logger (save_file).
train_log: !ref <output_folder>/train_log.txt
# Pretrained encoder, in Hugging Face format: `hubert_hub` is used as the
# `source` of the HuBERT object and `hubert_folder` as its `save_path`.
hubert_hub: Orange/SSA-HuBERT-base-60k
hubert_folder: !ref <save_folder>/hubert_checkpoint
# Data files
# Replace PATH_TO_YOUR_FOLDER with the directory that holds the prepared CSVs.
data_folder: !ref PATH_TO_YOUR_FOLDER/data_speechbrain/<lang_csv>
ckpt_interval_minutes: 10 # save checkpoint every N min
train_csv: !ref <data_folder>/train.csv
valid_csv: !ref <data_folder>/validation.csv
# `test_csv` is a list, so more than one test manifest can be given.
test_csv:
- !ref <data_folder>/test.csv
####################### Training Parameters ####################################
# Upper bound given to the epoch counter (`limit`).
number_of_epochs: 10
# Learning rate of the Adadelta optimizer (model_opt_class) and initial value
# of its scheduler (lr_annealing_model).
lr: 0.1
# Learning rate of the Adam optimizer used for HuBERT (hubert_opt_class) and
# initial value of its scheduler (lr_annealing_hubert).
lr_hubert: 0.000005
# NOTE(review): presumably the order in which training utterances are sorted
# by duration -- confirm in the training script.
sorting: ascending
precision: fp32 # bf16, fp16 or fp32
# Also used as `orig_freq` of the speed-perturbation augmentation.
sample_rate: 16000
# Skip audio files longer than this value.
# NOTE(review): unit is presumably seconds -- confirm in the data pipeline.
avoid_if_longer_than: 60
# `batch_size` feeds the train and valid dataloaders, `test_batch_size` the
# test dataloader.
batch_size: 2
test_batch_size: 2
# Dataloader options
# One mapping of dataloader options per split. The `batch_size` entries must
# be nested under their mapping: at column 0 they would become duplicate
# top-level `batch_size` keys and leave the three mappings empty.
train_dataloader_opts:
    batch_size: !ref <batch_size>

valid_dataloader_opts:
    batch_size: !ref <batch_size>

test_dataloader_opts:
    batch_size: !ref <test_batch_size>
####################### Model Parameters #######################################
# Activation class, number of blocks and width of the DNN placed on top of
# HuBERT (consumed by `top_lin`).
activation: !name:torch.nn.LeakyReLU
dnn_layers: 2
dnn_neurons: 1024
# Passed to the HuBERT object as `freeze`.
freeze_hubert: False
# Outputs
# Size of the output layer (`n_neurons` of `ctc_lin`). Labels come from a
# CTCTextEncoder, not from a BPE tokenizer.
# NOTE(review): presumably character vocabulary size + blank -- confirm that
# 66 matches the label set built from the training transcripts.
output_neurons: 66
# Index of the CTC blank label; shared by the loss and the beam search.
blank_index: 0
#
# Functions and classes
#
# Text label encoder, created without arguments. It is also registered in the
# checkpointer's recoverables under the name `tokenizer`.
label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder

# Epoch counter bounded by `number_of_epochs`. `limit` is a constructor
# argument and therefore has to be nested under the object.
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>
# Pretrained HuBERT encoder loaded from `hubert_hub` and stored under
# `hubert_folder`. All four entries are constructor arguments and must be
# nested under the object.
hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT
    source: !ref <hubert_hub>
    output_norm: True
    # False here means the encoder is fine-tuned together with the DNN head.
    freeze: !ref <freeze_hubert>
    save_path: !ref <hubert_folder>
# DNN head applied to the HuBERT output. Arguments are nested under the
# object; at column 0 `activation` and `dnn_neurons` would overwrite the
# top-level hyperparameters of the same name.
top_lin: !new:speechbrain.lobes.models.VanillaNN.VanillaNN
    input_shape: [null, null, 768] # 768 == output of hubert base model
    activation: !ref <activation>
    dnn_blocks: !ref <dnn_layers>
    dnn_neurons: !ref <dnn_neurons>

# Output projection from the DNN width to the number of output labels.
ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dnn_neurons>
    n_neurons: !ref <output_neurons>

# Softmax configured with apply_log enabled.
log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

# `!name` does not call ctc_loss here; it yields the function with
# `blank_index` already bound, to be called by the training script.
ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
    blank_index: !ref <blank_index>
# Named modules of the system. The entries are nested so that they form the
# `modules` mapping instead of redefining the top-level `hubert`, `top_lin`
# and `ctc_lin` objects as references to themselves.
modules:
    hubert: !ref <hubert>
    top_lin: !ref <top_lin>
    ctc_lin: !ref <ctc_lin>

# The layers on top of HuBERT grouped into one ModuleList. This is the object
# registered as `model` in the checkpointer; HuBERT is registered separately.
model: !new:torch.nn.ModuleList
    - [!ref <top_lin>, !ref <ctc_lin>]
# Optimizer factories. `!name` binds the keyword arguments without creating
# the optimizer.
# NOTE(review): presumably the training script supplies the parameters of
# `model` to the first and of `hubert` to the second -- confirm there.
model_opt_class: !name:torch.optim.Adadelta
    lr: !ref <lr>
    rho: 0.95
    eps: 1.e-8

hubert_opt_class: !name:torch.optim.Adam
    lr: !ref <lr_hubert>

# One learning-rate scheduler per optimizer, each starting from the matching
# learning rate. Both are registered in the checkpointer's recoverables.
lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    improvement_threshold: 0.0025
    annealing_factor: 0.8
    patient: 0

lr_annealing_hubert: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr_hubert>
    improvement_threshold: 0.0025
    annealing_factor: 0.9
    patient: 0
############################## Augmentations ###################################

# Speed perturbation
# NOTE(review): `speeds` are presumably percentages of the original speed
# (95 %, 100 %, 105 %) -- confirm against the SpeedPerturb documentation.
speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
    orig_freq: !ref <sample_rate>
    speeds: [95, 100, 105]

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: 0
    drop_freq_high: 1
    drop_freq_count_low: 1
    drop_freq_count_high: 3
    drop_freq_width: 0.05

# Time drop: randomly drops a number of temporal chunks.
# NOTE(review): chunk lengths are presumably expressed in samples -- confirm
# against the DropChunk documentation.
drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: 1000
    drop_length_high: 2000
    drop_count_low: 1
    drop_count_high: 5
# Augmenter: Combines previously defined augmentations to perform data augmentation
# Only three augmentations are listed below, so min/max are set to 3 (they
# previously asked for 4, one more than is available). With augment_prob at
# 1.0 and min == max == 3, every listed augmentation is requested each time.
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    concat_original: True
    min_augmentations: 3
    max_augmentations: 3
    augment_prob: 1.0
    augmentations:
        - !ref <speed_perturb>
        - !ref <drop_freq>
        - !ref <drop_chunk>
############################## Decoding ########################################

# Decoding parameters
# All entries below belong to the `test_beam_search` mapping.
# NOTE(review): presumably forwarded as keyword arguments to the CTC beam
# searcher built in the training script -- confirm there.
test_beam_search:
    beam_size: 143
    topk: 1
    blank_index: !ref <blank_index>
    space_token: ' ' # make sure this is the same as the one used in the tokenizer
    beam_prune_logp: -12.0
    token_prune_min_logp: -1.20
    prune_history: True
    alpha: 0.8
    beta: 1.2
    # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM
    # It can either be a .bin or .arpa ; note: .arpa is much slower at loading
    # If you don't want to use an LM, comment it out or set it to null
    kenlm_model_path: null
############################## Logging and Pretrainer ##########################

# Checkpointer writing to `save_folder`. Every entry of `recoverables` maps a
# checkpoint name to the object registered under it, so the entries must be
# nested two levels deep rather than sit at the top level.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        hubert: !ref <hubert>
        model: !ref <model>
        scheduler_model: !ref <lr_annealing_model>
        scheduler_hubert: !ref <lr_annealing_hubert>
        counter: !ref <epoch_counter>
        tokenizer: !ref <label_encoder>

# Text logger writing to `train_log`.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>
# Metric factories (`!name`: a fresh ErrorRateStats is created each time the
# training script calls them).
error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

# Same statistic with `split_tokens` enabled.
# NOTE(review): presumably this yields a character error rate -- confirm
# against the ErrorRateStats documentation.
cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
    split_tokens: True