File size: 4,427 Bytes
21a93c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# ############################################################################
# model: Seq2Seq
# encoder: CRDNN model
# decoder: GRU + beamsearch
# tokens: BPE (unigram)
# losses: CTC+NLL
# training: Mozilla Common Voice 6.1, Spoken Wikipedia Corpus, M-AILABS Corpus
# authors: Ruhr-University Bochum 2021
#          adapted from 
#            Ju-Chieh Chou, 
#            Mirco Ravanelli,
#            Abdel Heba,
#            Peter Plantinga,
#            Samuele Cornell, 
#            Sung-Lin Yeh, 
#            Titouan Parcollet 2021
# ############################################################################

# set exp name
name: german-asr

# Feature parameters
sample_rate: 16000
n_fft: 400
n_mels: 40

# Model parameters
activation: !name:torch.nn.LeakyReLU
dropout: 0.15
cnn_blocks: 2
cnn_channels: (64, 128)
inter_layer_pooling_size: (2, 2)
cnn_kernelsize: (3, 3)
time_pooling_size: 4
rnn_class: !name:speechbrain.nnet.RNN.LSTM
rnn_layers: 4
rnn_neurons: 1024
rnn_bidirectional: True
dnn_blocks: 1
dnn_neurons: 1024
emb_size: 1024
dec_neurons: 1024
output_neurons: 5000  # Number of tokens (same as LM and tokenizer)


# Decoding parameters
blank_index: 0
pad_index: -1
bos_index: 1
eos_index: 2
unk_index: 0

min_decode_ratio: 0.0
max_decode_ratio: 1.0
beam_size: 30
eos_threshold: 1.5
using_max_attn_shift: True
max_attn_shift: 300
ctc_weight_decode: 0.3
ctc_window_size: 300
coverage_penalty: 1.5
temperature: 1.0


# Feature Extraction
normalizer: !new:speechbrain.processing.features.InputNormalization
   norm_type: global

compute_features: !new:speechbrain.lobes.features.Fbank
   sample_rate: !ref <sample_rate>
   n_fft: !ref <n_fft>
   n_mels: !ref <n_mels>

# Tokenizer
tokenizer: !new:sentencepiece.SentencePieceProcessor

# Encoder
enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
   input_shape: [null, null, !ref <n_mels>]
   activation: !ref <activation>
   dropout: !ref <dropout>
   cnn_blocks: !ref <cnn_blocks>
   cnn_channels: !ref <cnn_channels>
   cnn_kernelsize: !ref <cnn_kernelsize>
   inter_layer_pooling_size: !ref <inter_layer_pooling_size>
   time_pooling: True
   using_2d_pooling: False
   time_pooling_size: !ref <time_pooling_size>
   rnn_class: !ref <rnn_class>
   rnn_layers: !ref <rnn_layers>
   rnn_neurons: !ref <rnn_neurons>
   rnn_bidirectional: !ref <rnn_bidirectional>
   rnn_re_init: True
   dnn_blocks: !ref <dnn_blocks>
   dnn_neurons: !ref <dnn_neurons>
   use_rnnp: True

# Decoder
emb: !new:speechbrain.nnet.embedding.Embedding
   num_embeddings: !ref <output_neurons>
   embedding_dim: !ref <emb_size>
   
dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
   enc_dim: !ref <dnn_neurons>
   input_size: !ref <emb_size>
   rnn_type: gru
   attn_type: location
   hidden_size: !ref <dec_neurons>
   attn_dim: 1024
   num_layers: 1
   scaling: 1.0
   channels: 10
   kernel_size: 100
   re_init: True
   dropout: !ref <dropout>


# Losses
log_softmax: !new:speechbrain.nnet.activations.Softmax
   apply_log: True
   
ctc_lin: !new:speechbrain.nnet.linear.Linear
   input_size: !ref <dnn_neurons>
   n_neurons: !ref <output_neurons>

seq_lin: !new:speechbrain.nnet.linear.Linear
   input_size: !ref <dec_neurons>
   n_neurons: !ref <output_neurons>


# Compile model
asr_model: !new:torch.nn.ModuleList
    - [!ref <enc>, !ref <emb>, !ref <dec>, !ref <ctc_lin>, !ref <seq_lin>]
    
# We compose the inference (encoder) pipeline.
encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
    input_shape: [null, null, !ref <n_mels>]
    compute_features: !ref <compute_features>
    normalize: !ref <normalizer>
    model: !ref <enc>

# Beam searcher
decoder: !new:speechbrain.decoders.S2SRNNBeamSearcher
    embedding: !ref <emb>
    decoder: !ref <dec>
    linear: !ref <seq_lin>
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <beam_size>
    eos_threshold: !ref <eos_threshold>
    using_max_attn_shift: !ref <using_max_attn_shift>
    max_attn_shift: !ref <max_attn_shift>
    temperature: !ref <temperature>

modules:
    normalizer: !ref <normalizer>
    encoder: !ref <encoder>
    decoder: !ref <decoder>

# Load pretrained models
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
   loadables:
        asr: !ref <asr_model>
        tokenizer: !ref <tokenizer>
        normalizer: !ref <normalizer>