File size: 6,024 Bytes
aedfd73 561b4b9 aedfd73 0dffa1e aedfd73 c71d794 aedfd73 a64a2d3 0dffa1e aedfd73 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 |
# ############################################################################
# Model: Streaming E2E Conformer-Transducer ASR
# Encoder: Conformer
# Decoder: LSTM + greedy search
# Tokens: BPE with unigram
# losses: Transducer + CTC (optional) + CE (optional)
# Training: Librispeech 960h
# Authors: Sylvain de Langen 2023, Titouan Parcollet 2023
# ############################################################################
# Folder name referenced by <pretrainer> as `collect_in`.
# NOTE(review): the `!ref` tag is applied to a string containing no <...>
# reference; it looks redundant — confirm before removing.
save_folder: !ref librispeech-streaming-conformer-transducer
# Feature parameters
# These four values are forwarded unchanged to <compute_features>.
sample_rate: 16000
n_fft: 512
# NOTE(review): 80 also appears hard-coded as the last entry of the CNN
# `input_shape`; presumably the two must be kept in sync — confirm.
n_mels: 80
# NOTE(review): unit is not stated in this file (presumably milliseconds) — confirm.
win_length: 32
# Streaming
# Not referenced by any other key visible in this file; presumably read by
# the code that loads these hyperparameters.
streaming: True # controls all Dynamic Chunk Training & chunk size & left context mechanisms
# Model parameters
# Transformer
# d_model: used by <Transformer> and as the input size of <proj_enc>.
d_model: 512
# joint_dim: output size of <proj_enc>/<proj_dec>, input size of
# <proj_ctc> and <transducer_lin>.
joint_dim: 640
nhead: 8
num_encoder_layers: 12
# Zero decoder layers: the Transformer is only used through <enc>
# (EncoderWrapper), so no Transformer decoder is run.
num_decoder_layers: 0
d_ffn: 2048
transformer_dropout: 0.1
# !name stores the class itself rather than an instance; it is passed to
# <Transformer> (activation) and <Tjoint> (nonlinearity).
activation: !name:torch.nn.GELU
# Vocabulary size: used by <Transformer>, <proj_ctc>, <emb>, <dec> and
# <transducer_lin>.
output_neurons: 1000
# dec_dim: hidden size of <dec> and input size of <proj_dec>.
dec_dim: 512
# NOTE(review): dec_emb_dropout and dec_dropout are not referenced by any
# other key visible in this file — confirm whether they are still needed.
dec_emb_dropout: 0.2
dec_dropout: 0.1
# Decoding parameters
# blank_index is passed to <emb>, <Greedysearcher> and <Beamsearcher>.
blank_index: 0
# NOTE(review): bos_index, eos_index and pad_index are not referenced by any
# other key visible in this file — confirm whether external code reads them.
bos_index: 0
eos_index: 0
pad_index: 0
# beam_size and nbest are only used by <Beamsearcher>; <Greedysearcher>
# hard-codes both to 1.
beam_size: 10
nbest: 1
# By default {state,expand}_beam = 2.3, as mentioned in the paper:
# https://arxiv.org/abs/1904.02619
state_beam: 2.3
expand_beam: 2.3
# Only referenced from a commented-out line in <Beamsearcher>, so it has no
# effect as long as that line stays disabled.
lm_weight: 0.50
# Input normalizer instance. The indented keys are the constructor arguments
# of the !new object (they must be nested, otherwise they parse as unrelated
# top-level keys). Referenced by <modules>, by <pretrainer> under the name
# "normalizer", and by <fea_streaming_extractor>.
normalize: !new:speechbrain.processing.features.InputNormalization
    norm_type: global
    update_until_epoch: 4
# Fbank feature extractor built from the shared feature parameters above.
# The arguments must be nested under the !new tag: at column 0 each line
# would redefine the top-level scalar as a reference to itself.
# Referenced by <fea_streaming_extractor>.
compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mels>
    win_length: !ref <win_length>
# Convolutional front-end placed before the encoder: 2 blocks, 1 layer per
# block. Referenced by <modules>, <model> and <fea_streaming_extractor>.
CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
    # NOTE(review): the trailing 80 presumably mirrors <n_mels>; the first two
    # entries look like placeholder batch/time sizes — confirm.
    input_shape: (8, 10, 80)
    num_blocks: 2
    num_layers_per_block: 1
    # One entry per block for each of the tuples below.
    out_channels: (64, 32)
    kernel_sizes: (3, 3)
    strides: (2, 2)
    residuals: (False, False)
# Conformer encoder, instantiated through TransformerASR with zero decoder
# layers (see <num_decoder_layers>). Only used through the <enc> wrapper.
# All arguments must be nested under the !new tag; at column 0 they would
# collide with the top-level scalars of the same name.
Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
    # NOTE(review): hard-coded; presumably the flattened <CNN> output width
    # (80 mel bins / (2 * 2) strides * 32 channels = 640). It is NOT tied to
    # <joint_dim> even though the value is equal — confirm.
    input_size: 640
    tgt_vocab: !ref <output_neurons>
    d_model: !ref <d_model>
    nhead: !ref <nhead>
    num_encoder_layers: !ref <num_encoder_layers>
    num_decoder_layers: !ref <num_decoder_layers>
    d_ffn: !ref <d_ffn>
    dropout: !ref <transformer_dropout>
    activation: !ref <activation>
    encoder_module: conformer
    attention_type: RelPosMHAXL
    normalize_before: True
    causal: False
# We must call an encoder wrapper so the decoder isn't run (we don't have any)
# Wraps <Transformer>; this wrapper, not <Transformer> itself, is what
# <modules> and <model> reference.
enc: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
    transformer: !ref <Transformer>
# For MTL CTC over the encoder
# Linear layer mapping <joint_dim> features to <output_neurons> outputs.
# NOTE(review): the input size is <joint_dim>, not <d_model>, so this
# presumably consumes the output of <proj_enc> — confirm against the caller.
proj_ctc: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <joint_dim>
    n_neurons: !ref <output_neurons>
# Define some projection layers to make sure that enc and dec
# output dim are the same before joining
# Both layers are bias-free and project to <joint_dim>:
#   proj_enc: <d_model> -> <joint_dim>
#   proj_dec: <dec_dim> -> <joint_dim>
proj_enc: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <joint_dim>
    bias: False

proj_dec: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dec_dim>
    n_neurons: !ref <joint_dim>
    bias: False
# Embedding for the prediction network, configured as one-hot with
# <blank_index> as the blank id. First element of `decode_network_lst` in
# both searchers; also listed in <modules> and <model>.
emb: !new:speechbrain.nnet.embedding.Embedding
    num_embeddings: !ref <output_neurons>
    consider_as_one_hot: True
    blank_id: !ref <blank_index>
# Single-layer LSTM prediction network with hidden size <dec_dim>.
dec: !new:speechbrain.nnet.RNN.LSTM
    # Last dimension is <output_neurons> - 1.
    # NOTE(review): presumably because the one-hot <emb> output excludes the
    # blank symbol — confirm against the Embedding implementation.
    input_shape: [null, null, !ref <output_neurons> - 1]
    hidden_size: !ref <dec_dim>
    num_layers: 1
    re_init: True
# Transducer joint network: combines its inputs by summation and applies
# <activation> as nonlinearity. Listed in <modules> and passed to both
# searchers as `tjoint`; it is not part of the <model> ModuleList.
Tjoint: !new:speechbrain.nnet.transducer.transducer_joint.Transducer_joint
    joint: sum # joint [sum | concat]
    nonlinearity: !ref <activation>
# Bias-free output layer mapping <joint_dim> features to <output_neurons>
# outputs. Used as the only entry of `classifier_network` in both searchers.
transducer_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <joint_dim>
    n_neurons: !ref <output_neurons>
    bias: False
# Name -> module mapping exposing the objects defined above.
# The entries must be nested under `modules:`; at column 0 each line would
# replace the corresponding top-level object with a reference to itself.
modules:
    CNN: !ref <CNN>
    enc: !ref <enc>
    emb: !ref <emb>
    dec: !ref <dec>
    Tjoint: !ref <Tjoint>
    transducer_lin: !ref <transducer_lin>
    normalize: !ref <normalize>
    proj_ctc: !ref <proj_ctc>
    proj_dec: !ref <proj_dec>
    proj_enc: !ref <proj_enc>
# ModuleList loaded by <pretrainer> under the name "model". The single
# positional argument is the list of member modules.
# NOTE(review): the order of the entries presumably has to match the order
# used when the checkpoint was saved — do not reorder without confirming.
# <normalize> and <Tjoint> are intentionally not in this list as written.
model: !new:torch.nn.ModuleList
    - [!ref <CNN>, !ref <enc>, !ref <emb>, !ref <dec>, !ref <proj_enc>, !ref <proj_dec>, !ref <proj_ctc>, !ref <transducer_lin>]
# Tokenizer initialization
# Created with no constructor arguments here; it is listed in the
# <pretrainer> loadables under the name "tokenizer".
tokenizer: !new:sentencepiece.SentencePieceProcessor
# Greedy decoder: a TransducerBeamSearcher with beam_size and nbest fixed
# to 1. Passed as the bound "self" argument of <decoding_function>.
Greedysearcher: !new:speechbrain.decoders.transducer.TransducerBeamSearcher
    # Prediction-network chain: embedding -> LSTM -> projection to <joint_dim>.
    decode_network_lst: [!ref <emb>, !ref <dec>, !ref <proj_dec>]
    tjoint: !ref <Tjoint>
    classifier_network: [!ref <transducer_lin>]
    blank_id: !ref <blank_index>
    beam_size: 1
    nbest: 1
# Beam-search decoder sharing the same networks as <Greedysearcher>, but
# driven by <beam_size>, <nbest>, <state_beam> and <expand_beam>.
Beamsearcher: !new:speechbrain.decoders.transducer.TransducerBeamSearcher
    decode_network_lst: [!ref <emb>, !ref <dec>, !ref <proj_dec>]
    tjoint: !ref <Tjoint>
    classifier_network: [!ref <transducer_lin>]
    blank_id: !ref <blank_index>
    beam_size: !ref <beam_size>
    nbest: !ref <nbest>
    # Language-model fusion is disabled: no <lm_model> is defined in this
    # file, so the two lines below stay commented out.
    # lm_module: !ref <lm_model>
    # lm_weight: !ref <lm_weight>
    state_beam: !ref <state_beam>
    expand_beam: !ref <expand_beam>
# Pretrainer that collects files into <save_folder> and maps loadable names
# to the objects they are loaded into. `loadables` is itself a nested
# mapping, so its entries sit one level deeper than `collect_in`.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    collect_in: !ref <save_folder>
    loadables:
        model: !ref <model>
        normalizer: !ref <normalize>
        tokenizer: !ref <tokenizer>
# inference stuff
# The !name tag stores a callable instead of calling it, so each key below
# holds a factory/function to be invoked later by the inference code.
make_tokenizer_streaming_context: !name:speechbrain.tokenizers.SentencePiece.SentencePieceDecoderStreamingContext
tokenizer_decode_streaming: !name:speechbrain.tokenizers.SentencePiece.spm_decode_preserve_leading_space
make_decoder_streaming_context: !name:speechbrain.decoders.transducer.TransducerGreedySearcherStreamingContext # default constructor
# Unbound method with <Greedysearcher> pre-supplied as its first positional
# argument; the list item must be nested under the !name tag to be applied.
decoding_function: !name:speechbrain.decoders.transducer.TransducerBeamSearcher.transducer_greedy_decode_streaming
    - !ref <Greedysearcher> # self
# Streaming feature pipeline: wraps the chain
# compute_features -> normalize -> CNN in a StreamingFeatureWrapper.
# `module` and `properties` are both arguments of the wrapper, and each has
# its own nested positional-argument list.
fea_streaming_extractor: !new:speechbrain.lobes.features.StreamingFeatureWrapper
    module: !new:speechbrain.nnet.containers.LengthsCapableSequential
        - !ref <compute_features>
        - !ref <normalize>
        - !ref <CNN>
    # don't consider normalization as part of the input filter chain.
    # normalization will operate at chunk level, which mismatches training
    # somewhat, but does not appear to result in noticeable degradation.
    # Hence <normalize> is deliberately absent from the list passed to
    # stack_filter_properties, which is evaluated at load time via !apply.
    properties: !apply:speechbrain.utils.filter_analysis.stack_filter_properties
        - [!ref <compute_features>, !ref <CNN>]
|