# Generated 2023-06-18 from:
# /kaggle/working/direct-train.yaml
# yamllint disable
# ############################################################################
# Model: Direct SLU
# Encoder: Pre-trained ASR encoder -> LSTM
# Decoder: GRU + beamsearch
# Tokens: SentencePiece unigram (58 units)
# Losses: NLL
# Training: SLURP
# Authors:  Loren Lugosch, Mirco Ravanelli 2020
# ############################################################################
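# A file like this is typically consumed through HyperPyYAML, which instantiates
# every !new:/!apply: object on load. Minimal loading sketch (assumes the
# standard hyperpyyaml / SpeechBrain Pretrainer API; the file name is
# illustrative):
#
#   from hyperpyyaml import load_hyperpyyaml
#
#   with open("direct-train.yaml") as fin:
#       hparams = load_hyperpyyaml(fin)
#   hparams["pretrainer"].collect_files()   # fetch the pretrained tokenizer
#   hparams["pretrainer"].load_collected()  # load it into `tokenizer` below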

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1986
__set_seed: !apply:torch.manual_seed [1986]
# ADD: prepared folder from prev step
prepared_folder: results/prepared
output_folder: results/better_tokenizer/1986
save_folder: results/better_tokenizer/1986/save
train_log: results/better_tokenizer/1986/train_log.txt
log_folder: results/better_tokenizer/1986/log

# Data files
# The SLURP dataset will be automatically downloaded in the specified data_folder
# data_folder: !PLACEHOLDER # e.g, /localscratch/SLURP
data_folder: /slurp/audio
data_folder_rirs: /slurp/audio
train_splits: [train_synthetic, train_real]
csv_train: results/prepared/train-type=direct-sample=0.2.csv
csv_valid: results/prepared/devel-type=direct-sample=0.2.csv
csv_test: results/prepared/test-type=direct-sample=0.2.csv
tokenizer_file: https://www.dropbox.com/s/tmwq12r5vgcsif9/58_unigram.model?dl=1
skip_prep: false

# Training parameters
number_of_epochs: 20
batch_size: 16
lr: 0.0003
# token_type: unigram # ["unigram", "bpe", "char"]
sorting: random
ckpt_interval_minutes: 15 # save checkpoint every N min
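# With sorting: random, training batches are drawn in shuffled order rather
# than sorted by utterance duration (see dataloader_opts below).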

# Model parameters
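# Dimension flow: the pretrained ASR encoder emits ASR_encoder_dim (512)
# features per frame, the SLU encoder projects them to encoder_dim (256), the
# attentional decoder uses dec_neurons (512) hidden units with emb_size (128)
# token embeddings, and the output layer has output_neurons (58) units to match
# the 58-token SentencePiece model.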
sample_rate: 16000
emb_size: 128
dec_neurons: 512
output_neurons: 58 # index(eos/bos) = 0
ASR_encoder_dim: 512
encoder_dim: 256

# Decoding parameters
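# bos and eos share index 0 (see output_neurons above); beam search stops at
# eos or once the output reaches max_decode_ratio times the encoder length.
# eos_threshold and temperature are passed to the beam searcher defined below.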
bos_index: 0
eos_index: 0
min_decode_ratio: 0.0
max_decode_ratio: 10.0
slu_beam_size: 80
eos_threshold: 1.5
temperature: 1.25

dataloader_opts:
  batch_size: 16
  shuffle: true

epoch_counter: &id009 !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: 20

# Models
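# The pretrained LibriSpeech CRDNN ASR model (loaded onto cuda:0) is used as a
# feature extractor only: it does not appear in `modules`/`model` below, so its
# weights are neither fine-tuned nor checkpointed.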
asr_model: !apply:speechbrain.pretrained.EncoderDecoderASR.from_hparams
  source: speechbrain/asr-crdnn-rnnlm-librispeech
  run_opts: {device: cuda:0}

slu_enc: &id001 !new:speechbrain.nnet.containers.Sequential
  input_shape: [null, null, 512]
  lstm: !new:speechbrain.nnet.RNN.LSTM
    input_size: 512
    bidirectional: true
    hidden_size: 256
    num_layers: 2
  linear: !new:speechbrain.nnet.linear.Linear
    input_size: 512
    n_neurons: 256
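# The bidirectional LSTM yields 2 x 256 = 512 features per frame; the final
# linear layer projects them to encoder_dim (256) to match the decoder's
# enc_dim.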

output_emb: &id002 !new:speechbrain.nnet.embedding.Embedding
  num_embeddings: 58
  embedding_dim: 128

dec: &id003 !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
  enc_dim: 256
  input_size: 128
  rnn_type: gru
  attn_type: keyvalue
  hidden_size: 512
  attn_dim: 512
  num_layers: 3
  scaling: 1.0
  dropout: 0.0
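# GRU decoder with key-value attention over the 256-dim encoder states; its
# 512-dim hidden states are turned into token scores by seq_lin below.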

seq_lin: &id004 !new:speechbrain.nnet.linear.Linear
  input_size: 512
  n_neurons: 58
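# Projects decoder states to the 58-token vocabulary; log_softmax below turns
# the scores into log-probabilities for the NLL loss (seq_cost).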

env_corrupt: &id005 !new:speechbrain.lobes.augment.EnvCorrupt
  openrir_folder: /slurp/audio
  babble_prob: 0.0
  reverb_prob: 0.0
  noise_prob: 1.0
  noise_snr_low: 0
  noise_snr_high: 15
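# Noise-only corruption: additive noise with probability 1.0 and SNR drawn from
# 0-15 dB, while babble and reverberation are disabled. Noise samples come from
# the OpenRIR data expected (or downloaded) under openrir_folder.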

modules:
  slu_enc: *id001
  output_emb: *id002
  dec: *id003
  seq_lin: *id004
  env_corrupt: *id005

model: &id007 !new:torch.nn.ModuleList
- [*id001, *id002, *id003, *id004]
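# The list above holds the trainable parameters: SLU encoder, output embedding,
# decoder and output linear layer. The pretrained ASR model and the tokenizer
# are deliberately excluded.
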
tokenizer: &id006 !new:sentencepiece.SentencePieceProcessor

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  collect_in: results/better_tokenizer/1986/save/SLURM_tokenizer
  loadables:
    tokenizer: *id006
  paths:
    tokenizer: https://www.dropbox.com/s/tmwq12r5vgcsif9/58_unigram.model?dl=1
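# Fetches the 58-unit SentencePiece model from the URL above into collect_in
# and loads it into the `tokenizer` object before training starts.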

beam_searcher: !new:speechbrain.decoders.S2SRNNBeamSearcher
  embedding: *id002
  decoder: *id003
  linear: *id004
  bos_index: 0
  eos_index: 0
  min_decode_ratio: 0.0
  max_decode_ratio: 10.0
  beam_size: 80
  eos_threshold: 1.5
  temperature: 1.25
  using_max_attn_shift: false
  max_attn_shift: 30
  coverage_penalty: 0.
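# Beam search with 80 hypotheses per utterance, mirroring the Decoding
# parameters section above; the attention-shift constraint and the coverage
# penalty are disabled.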

opt_class: !name:torch.optim.Adam
  lr: 0.0003

lr_annealing: &id008 !new:speechbrain.nnet.schedulers.NewBobScheduler
  initial_value: 0.0003
  improvement_threshold: 0.0025
  annealing_factor: 0.8
  patient: 0
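# New-Bob annealing: when the relative improvement of the validation metric
# falls below improvement_threshold (0.0025), the learning rate is multiplied
# by annealing_factor (0.8); patient: 0 means there is no grace period.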

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: results/better_tokenizer/1986/save
  recoverables:
    model: *id007
    scheduler: *id008
    counter: *id009

augmentation: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  sample_rate: 16000
  speeds: [95, 100, 105]
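# Time-domain augmentation: each training utterance is randomly resampled to
# 95%, 100% or 105% of its original speed.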

log_softmax: !new:speechbrain.nnet.activations.Softmax
  apply_log: true

seq_cost: !name:speechbrain.nnet.losses.nll_loss
  label_smoothing: 0.1

# DEFAULT: train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
#    save_file: !ref <train_log>
train_logger: !new:speechbrain.utils.train_logger.TensorboardLogger
  save_dir: results/better_tokenizer/1986/log

error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
  split_tokens: true
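# split_tokens: true scores at the character level, giving a CER alongside the
# token-level error rate above.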