File size: 4,509 Bytes
f0ce3b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# Generated 2021-09-15 from:
# /home/mila/s/subakany/speechbrain_new/recipes/WHAMandWHAMR/separation/hparams/convtasnet-whamr.yaml
# yamllint disable
# ################################
# Model: ConvTasNet for source separation
# https://arxiv.org/abs/1809.07454
#
# Dataset : WHAMR!
# ################################
# Basic parameters
# The seed has to be set at the top of the yaml, before any object with
# parameters is created further down.
seed: 3
# Reference `seed` instead of repeating the literal, so that changing or
# overriding `seed` also changes the value handed to torch.manual_seed.
__set_seed: !apply:torch.manual_seed [!ref <seed>]

# Data params

# Root folder of the WHAMR! dataset.
# data_folder needs to follow the format /yourpath/whamr:
# the top-level folder of the dataset must be named "whamr".
data_folder: /network/tmp1/subakany/whamr

# Path of the wsj0/si_tr_s/ folder -- only needed if dynamic mixing is used,
# e.g. /yourpath/wsj0-processed/si_tr_s/
# The original wsj0 data has to be converted to 8 kHz first; this can be done
# with the script ../meta/preprocess_dynamic_mixing.py
base_folder_dm: /network/tmp1/subakany/wsj0-processed/si_tr_s/

experiment_name: convtasnet-whamr
# Every path below is derived from experiment_name and seed, so renaming the
# experiment or changing the seed moves all outputs together instead of
# leaving stale hard-coded paths behind. With the current values these
# resolve to results/convtasnet-whamr/3/...
output_folder: !ref results/<experiment_name>/<seed>
train_log: !ref <output_folder>/train_log.txt
save_folder: !ref <output_folder>/save

# the file names should start with whamr instead of whamorg
train_data: !ref <save_folder>/whamr_tr.csv
valid_data: !ref <save_folder>/whamr_cv.csv
test_data: !ref <save_folder>/whamr_tt.csv
skip_prep: false

# Experiment params
auto_mix_prec: false # Set it to true for mixed precision
# NOTE(review): presumably skips training and only runs evaluation -- confirm
# in the training script.
test_only: false
# The MaskNet entry further down uses the same value for its C argument.
num_spks: 2 # set to 3 for wsj0-3mix
progressbar: true
save_audio: false # Save estimated sources on disk
# NOTE(review): presumably in Hz -- confirm.
sample_rate: 8000

# Training parameters
N_epochs: 200
batch_size: 1
lr: 0.00015
clip_grad_norm: 5
loss_upper_lim: 999999  # this is the upper limit for an acceptable loss
# if true, the training sequences are cut to a specified length
limit_training_signal_len: false
# this is the length the training sequences are cut to when
# limit_training_signal_len is true
# NOTE(review): presumably expressed in samples -- confirm in the training
# script.
training_signal_len: 32000000

# Set it to true to dynamically create mixtures at training time
dynamic_mixing: true

# Parameters for data augmentation

# rir_path points to the directory of the room impulse responses,
# e.g. /miniscratch/subakany/rir_wavs
# If the path does not exist, it is created automatically.
rir_path: /miniscratch/subakany/whamr_rirs_wav

# On/off switches for the individual augmentations; the speedperturb and
# wavedrop objects themselves are defined right after this group.
use_wavedrop: false
use_speedperturb: true
use_speedperturb_sameforeachsource: false
use_rand_shift: false
# NOTE(review): bounds of the random shift, presumably in samples and only
# relevant when use_rand_shift is true -- confirm in the training script.
min_shift: -8000
max_shift: 8000

# Augmenter set up for speed perturbation only: perturb_prob is 1.0 while
# both drop probabilities are 0.0.
speedperturb: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  perturb_prob: 1.0
  drop_freq_prob: 0.0
  drop_chunk_prob: 0.0
  # Reuse the top-level sample_rate so the augmenter cannot drift from it.
  sample_rate: !ref <sample_rate>
  # NOTE(review): presumably percentages of the original speed -- confirm in
  # the TimeDomainSpecAugment documentation.
  speeds: [95, 100, 105]

# Augmenter set up for dropping only: both drop probabilities are 1.0 while
# perturb_prob is 0.0.
wavedrop: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  perturb_prob: 0.0
  drop_freq_prob: 1.0
  drop_chunk_prob: 1.0
  # Reuse the top-level sample_rate so the augmenter cannot drift from it.
  sample_rate: !ref <sample_rate>

# loss thresholding -- this thresholds the training loss
threshold_byloss: true
# NOTE(review): threshold value applied when threshold_byloss is true;
# presumably on the same scale as the SI-SNR loss configured further down --
# confirm in the training script.
threshold: -30

# Encoder parameters
# The Encoder and Decoder entries further down use the same kernel size,
# stride and channel count as listed here.
N_encoder_out: 256
# NOTE(review): no consumer of out_channels is visible in this file --
# confirm whether the training script reads it.
out_channels: 256
kernel_size: 16
kernel_stride: 8

# Dataloader options
dataloader_opts:
  # Reference the top-level batch_size so that changing or overriding it
  # actually reaches the dataloader.
  batch_size: !ref <batch_size>
  num_workers: 3


# Specifying the network

# Encoder; its arguments reference the encoder parameters defined earlier
# instead of repeating the literals, so the two cannot disagree.
Encoder: &id001 !new:speechbrain.lobes.models.dual_path.Encoder
  kernel_size: !ref <kernel_size>
  out_channels: !ref <N_encoder_out>


# Mask-estimation network (ConvTasNet MaskNet).
# NOTE(review): the single-letter arguments presumably follow the notation of
# the Conv-TasNet paper -- confirm against the MaskNet docstring.
MaskNet: &id003 !new:speechbrain.lobes.models.conv_tasnet.MaskNet
  N: 256
  B: 256
  H: 512
  P: 3
  X: 6
  R: 4
  # Tie C to num_spks so the two cannot disagree when num_spks is changed or
  # overridden.
  C: !ref <num_spks>
  norm_type: gLN
  causal: false
  mask_nonlinear: relu

# Decoder; channel count, kernel size and stride reference the encoder
# parameters defined earlier so that encoder and decoder stay matched.
Decoder: &id002 !new:speechbrain.lobes.models.dual_path.Decoder
  in_channels: !ref <N_encoder_out>
  out_channels: 1
  kernel_size: !ref <kernel_size>
  stride: !ref <kernel_stride>
  bias: false


# Adam optimizer. The learning rate references the top-level lr so that
# changing or overriding lr actually affects the optimizer.
optimizer: !name:torch.optim.Adam
  lr: !ref <lr>
  weight_decay: 0

# Loss function: get_si_snr_with_pitwrapper from speechbrain.nnet.losses.
# NOTE(review): `!name:` presumably hands over the callable itself rather
# than calling it at load time -- confirm in the HyperPyYAML documentation.
loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper

# Learning-rate scheduler (ReduceLROnPlateau); the checkpointer further down
# lists it among its recoverables through the id005 anchor.
# NOTE(review): presumably the lr is multiplied by `factor` after `patience`
# epochs without improvement, and never before epoch 85 -- confirm in the
# ReduceLROnPlateau documentation.
lr_scheduler: &id005 !new:speechbrain.nnet.schedulers.ReduceLROnPlateau

  factor: 0.5
  patience: 2
  dont_halve_until_epoch: 85

# Epoch counter. The limit references N_epochs so that changing or
# overriding N_epochs actually changes the number of epochs.
epoch_counter: &id004 !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: !ref <N_epochs>

# Network components, given as aliases to the Encoder (id001),
# Decoder (id002) and MaskNet (id003) objects defined above.
modules:
  encoder: *id001
  decoder: *id002
  masknet: *id003
save_all_checkpoints: true
# Checkpointer tracking the three network components, the epoch counter and
# the lr scheduler (aliases to the objects defined above).
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  # Reference save_folder instead of repeating its path, so both always
  # point at the same directory.
  checkpoints_dir: !ref <save_folder>
  recoverables:
    encoder: *id001
    decoder: *id002
    masknet: *id003
    counter: *id004
    lr_scheduler: *id005
# File logger. The target references train_log instead of repeating its path.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: !ref <train_log>

# Pretrainer whose loadables are the Encoder, MaskNet and Decoder objects
# defined earlier in this file.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    encoder: !ref <Encoder>
    masknet: !ref <MaskNet>
    decoder: !ref <Decoder>