yifengyu committed on
Commit
58dda8a
1 Parent(s): 24a8ec6

Update model

Browse files
Files changed (30) hide show
  1. README.md +537 -3
  2. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/200epoch.pth +3 -0
  3. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/config.yaml +454 -0
  4. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/discriminator_backward_time.png +0 -0
  5. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/discriminator_fake_loss.png +0 -0
  6. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/discriminator_forward_time.png +0 -0
  7. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/discriminator_loss.png +0 -0
  8. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/discriminator_optim_step_time.png +0 -0
  9. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/discriminator_real_loss.png +0 -0
  10. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/discriminator_train_time.png +0 -0
  11. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_adv_loss.png +0 -0
  12. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_backward_time.png +0 -0
  13. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_feat_match_loss.png +0 -0
  14. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_forward_time.png +0 -0
  15. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_kl_loss.png +0 -0
  16. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_loss.png +0 -0
  17. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_mel_am_loss.png +0 -0
  18. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_mel_ddsp_loss.png +0 -0
  19. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_mel_loss.png +0 -0
  20. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_optim_step_time.png +0 -0
  21. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_phn_dur_loss.png +0 -0
  22. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_pitch_loss.png +0 -0
  23. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_score_dur_loss.png +0 -0
  24. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_train_time.png +0 -0
  25. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/gpu_max_cached_mem_GB.png +0 -0
  26. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/iter_time.png +0 -0
  27. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/optim0_lr0.png +0 -0
  28. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/optim1_lr0.png +0 -0
  29. exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/train_time.png +0 -0
  30. meta.yaml +8 -0
README.md CHANGED
@@ -1,3 +1,537 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - singing-voice-synthesis
6
+ language: zh
7
+ datasets:
8
+ - opencpop
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 SVS model
13
+
14
+ ### `yifengyu/svs_train_visinger2plus_mert_raw_phn_None_zh_200epoch`
15
+
16
+ This model was trained by jerryuhoo using the opencpop recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout 4c55d6c9071fb36addcc8426f2befd8f9a1bd11e
26
+ pip install -e .
27
+ cd egs2/opencpop/svs1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model yifengyu/svs_train_visinger2plus_mert_raw_phn_None_zh_200epoch
29
+ ```
30
+
31
+
32
+
33
+ ## SVS config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: ./conf/tuning/train_visinger_mert.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ dry_run: false
42
+ iterator_type: sequence
43
+ output_dir: exp/24k/svs_train_visinger_mert_raw_phn_None_zh
44
+ ngpu: 1
45
+ seed: 777
46
+ num_workers: 4
47
+ num_att_plot: 3
48
+ dist_backend: nccl
49
+ dist_init_method: env://
50
+ dist_world_size: null
51
+ dist_rank: null
52
+ local_rank: 0
53
+ dist_master_addr: null
54
+ dist_master_port: null
55
+ dist_launcher: null
56
+ multiprocessing_distributed: false
57
+ unused_parameters: true
58
+ sharded_ddp: false
59
+ cudnn_enabled: true
60
+ cudnn_benchmark: false
61
+ cudnn_deterministic: false
62
+ collect_stats: false
63
+ write_collected_feats: false
64
+ max_epoch: 200
65
+ patience: null
66
+ val_scheduler_criterion:
67
+ - valid
68
+ - loss
69
+ early_stopping_criterion:
70
+ - valid
71
+ - loss
72
+ - min
73
+ best_model_criterion:
74
+ - - train
75
+ - total_count
76
+ - max
77
+ keep_nbest_models: 10
78
+ nbest_averaging_interval: 0
79
+ grad_clip: -1
80
+ grad_clip_type: 2.0
81
+ grad_noise: false
82
+ accum_grad: 1
83
+ no_forward_run: false
84
+ resume: true
85
+ train_dtype: float32
86
+ use_amp: false
87
+ log_interval: 50
88
+ use_matplotlib: true
89
+ use_tensorboard: true
90
+ create_graph_in_tensorboard: false
91
+ use_wandb: false
92
+ wandb_project: null
93
+ wandb_id: null
94
+ wandb_entity: null
95
+ wandb_name: null
96
+ wandb_model_log_interval: -1
97
+ detect_anomaly: false
98
+ pretrain_path: null
99
+ init_param: []
100
+ ignore_init_mismatch: false
101
+ freeze_param: []
102
+ num_iters_per_epoch: 1000
103
+ batch_size: 4
104
+ valid_batch_size: null
105
+ batch_bins: 1000000
106
+ valid_batch_bins: null
107
+ train_shape_file:
108
+ - exp/24k/svs_stats_raw_phn_None_zh/train/text_shape.phn
109
+ - exp/24k/svs_stats_raw_phn_None_zh/train/singing_shape
110
+ valid_shape_file:
111
+ - exp/24k/svs_stats_raw_phn_None_zh/valid/text_shape.phn
112
+ - exp/24k/svs_stats_raw_phn_None_zh/valid/singing_shape
113
+ batch_type: sorted
114
+ valid_batch_type: null
115
+ fold_length:
116
+ - 150
117
+ - 384000
118
+ sort_in_batch: descending
119
+ sort_batch: descending
120
+ multiple_iterator: false
121
+ chunk_length: 500
122
+ chunk_shift_ratio: 0.5
123
+ num_cache_chunks: 1024
124
+ chunk_excluded_key_prefixes: []
125
+ train_data_path_and_name_and_type:
126
+ - - dump/24k/raw/tr_no_dev/text
127
+ - text
128
+ - text
129
+ - - dump/24k/raw/tr_no_dev/wav.scp
130
+ - singing
131
+ - sound
132
+ - - dump/24k/raw/tr_no_dev/label
133
+ - label
134
+ - duration
135
+ - - dump/24k/raw/tr_no_dev/score.scp
136
+ - score
137
+ - score
138
+ valid_data_path_and_name_and_type:
139
+ - - dump/24k/raw/dev/text
140
+ - text
141
+ - text
142
+ - - dump/24k/raw/dev/wav.scp
143
+ - singing
144
+ - sound
145
+ - - dump/24k/raw/dev/label
146
+ - label
147
+ - duration
148
+ - - dump/24k/raw/dev/score.scp
149
+ - score
150
+ - score
151
+ allow_variable_data_keys: false
152
+ max_cache_size: 0.0
153
+ max_cache_fd: 32
154
+ valid_max_cache_size: null
155
+ exclude_weight_decay: false
156
+ exclude_weight_decay_conf: {}
157
+ optim: adamw
158
+ optim_conf:
159
+ lr: 0.0002
160
+ betas:
161
+ - 0.8
162
+ - 0.99
163
+ eps: 1.0e-09
164
+ weight_decay: 0.0
165
+ scheduler: exponentiallr
166
+ scheduler_conf:
167
+ gamma: 0.998
168
+ optim2: adamw
169
+ optim2_conf:
170
+ lr: 0.0002
171
+ betas:
172
+ - 0.8
173
+ - 0.99
174
+ eps: 1.0e-09
175
+ weight_decay: 0.0
176
+ scheduler2: exponentiallr
177
+ scheduler2_conf:
178
+ gamma: 0.998
179
+ generator_first: false
180
+ input_size: null
181
+ token_list:
182
+ - <blank>
183
+ - <unk>
184
+ - SP
185
+ - i
186
+ - AP
187
+ - e
188
+ - y
189
+ - d
190
+ - w
191
+ - sh
192
+ - ai
193
+ - n
194
+ - x
195
+ - j
196
+ - ian
197
+ - u
198
+ - l
199
+ - h
200
+ - b
201
+ - o
202
+ - zh
203
+ - an
204
+ - ou
205
+ - m
206
+ - q
207
+ - z
208
+ - en
209
+ - g
210
+ - ing
211
+ - ei
212
+ - ao
213
+ - ang
214
+ - uo
215
+ - eng
216
+ - t
217
+ - a
218
+ - ong
219
+ - ui
220
+ - k
221
+ - f
222
+ - r
223
+ - iang
224
+ - ch
225
+ - v
226
+ - in
227
+ - iao
228
+ - ie
229
+ - iu
230
+ - c
231
+ - s
232
+ - van
233
+ - p
234
+ - ve
235
+ - uan
236
+ - uang
237
+ - ia
238
+ - ua
239
+ - uai
240
+ - un
241
+ - er
242
+ - vn
243
+ - iong
244
+ - <sos/eos>
245
+ odim: null
246
+ model_conf: {}
247
+ use_preprocessor: true
248
+ token_type: phn
249
+ bpemodel: null
250
+ non_linguistic_symbols: null
251
+ cleaner: null
252
+ g2p: null
253
+ fs: 24000
254
+ frontend: s3prl
255
+ frontend_conf:
256
+ frontend_conf:
257
+ upstream: hf_mert_custom
258
+ path_or_url: m-a-p/MERT-v1-330M
259
+ download_dir: ./hub
260
+ multilayer_feature: true
261
+ score_feats_extract: syllable_score_feats
262
+ score_feats_extract_conf:
263
+ fs: 24000
264
+ n_fft: 2048
265
+ win_length: 2048
266
+ hop_length: 480
267
+ feats_extract: fbank
268
+ feats_extract_conf:
269
+ n_fft: 2048
270
+ hop_length: 480
271
+ win_length: 2048
272
+ fs: 24000
273
+ fmin: 0
274
+ fmax: 22050
275
+ n_mels: 80
276
+ normalize: null
277
+ normalize_conf: {}
278
+ svs: vits
279
+ svs_conf:
280
+ generator_type: visinger2
281
+ vocoder_generator_type: visinger2
282
+ generator_params:
283
+ hidden_channels: 192
284
+ spks: -1
285
+ global_channels: -1
286
+ segment_size: 20
287
+ text_encoder_attention_heads: 2
288
+ text_encoder_ffn_expand: 4
289
+ text_encoder_blocks: 6
290
+ text_encoder_positionwise_layer_type: conv1d
291
+ text_encoder_positionwise_conv_kernel_size: 3
292
+ text_encoder_positional_encoding_layer_type: rel_pos
293
+ text_encoder_self_attention_layer_type: rel_selfattn
294
+ text_encoder_activation_type: swish
295
+ text_encoder_normalize_before: true
296
+ text_encoder_dropout_rate: 0.1
297
+ text_encoder_positional_dropout_rate: 0.0
298
+ text_encoder_attention_dropout_rate: 0.1
299
+ use_macaron_style_in_text_encoder: true
300
+ use_conformer_conv_in_text_encoder: false
301
+ text_encoder_conformer_kernel_size: -1
302
+ decoder_kernel_size: 7
303
+ decoder_channels: 512
304
+ decoder_upsample_scales:
305
+ - 12
306
+ - 10
307
+ - 2
308
+ - 2
309
+ decoder_upsample_kernel_sizes:
310
+ - 24
311
+ - 20
312
+ - 4
313
+ - 4
314
+ decoder_resblock_kernel_sizes:
315
+ - 3
316
+ - 7
317
+ - 11
318
+ decoder_resblock_dilations:
319
+ - - 1
320
+ - 3
321
+ - 5
322
+ - - 1
323
+ - 3
324
+ - 5
325
+ - - 1
326
+ - 3
327
+ - 5
328
+ use_weight_norm_in_decoder: true
329
+ posterior_encoder_kernel_size: 3
330
+ posterior_encoder_layers: 8
331
+ posterior_encoder_stacks: 1
332
+ posterior_encoder_base_dilation: 1
333
+ posterior_encoder_dropout_rate: 0.0
334
+ use_weight_norm_in_posterior_encoder: true
335
+ flow_flows: -1
336
+ flow_kernel_size: 5
337
+ flow_base_dilation: 1
338
+ flow_layers: 4
339
+ flow_dropout_rate: 0.0
340
+ use_weight_norm_in_flow: true
341
+ use_only_mean_in_flow: true
342
+ use_phoneme_predictor: false
343
+ vocabs: 63
344
+ aux_channels: 80
345
+ generator_type: visinger2
346
+ vocoder_generator_type: visinger2
347
+ fs: 24000
348
+ hop_length: 480
349
+ win_length: 2048
350
+ n_fft: 2048
351
+ discriminator_type: visinger2
352
+ discriminator_params:
353
+ scales: 1
354
+ scale_downsample_pooling: AvgPool1d
355
+ scale_downsample_pooling_params:
356
+ kernel_size: 4
357
+ stride: 2
358
+ padding: 2
359
+ scale_discriminator_params:
360
+ in_channels: 1
361
+ out_channels: 1
362
+ kernel_sizes:
363
+ - 15
364
+ - 41
365
+ - 5
366
+ - 3
367
+ channels: 128
368
+ max_downsample_channels: 1024
369
+ max_groups: 256
370
+ bias: true
371
+ downsample_scales:
372
+ - 4
373
+ - 4
374
+ - 4
375
+ - 4
376
+ nonlinear_activation: LeakyReLU
377
+ nonlinear_activation_params:
378
+ negative_slope: 0.1
379
+ use_weight_norm: true
380
+ use_spectral_norm: false
381
+ follow_official_norm: false
382
+ periods:
383
+ - 2
384
+ - 3
385
+ - 5
386
+ - 7
387
+ - 11
388
+ period_discriminator_params:
389
+ in_channels: 1
390
+ out_channels: 1
391
+ kernel_sizes:
392
+ - 5
393
+ - 3
394
+ channels: 32
395
+ downsample_scales:
396
+ - 3
397
+ - 3
398
+ - 3
399
+ - 3
400
+ - 1
401
+ max_downsample_channels: 1024
402
+ bias: true
403
+ nonlinear_activation: LeakyReLU
404
+ nonlinear_activation_params:
405
+ negative_slope: 0.1
406
+ use_weight_norm: true
407
+ use_spectral_norm: false
408
+ multi_freq_disc_params:
409
+ hidden_channels:
410
+ - 256
411
+ - 256
412
+ - 256
413
+ - 256
414
+ - 256
415
+ domain: double
416
+ mel_scale: true
417
+ divisors:
418
+ - 32
419
+ - 16
420
+ - 8
421
+ - 4
422
+ - 2
423
+ - 1
424
+ - 1
425
+ strides:
426
+ - 1
427
+ - 2
428
+ - 1
429
+ - 2
430
+ - 1
431
+ - 2
432
+ - 1
433
+ sample_rate: 24000
434
+ hop_lengths:
435
+ - 60
436
+ - 120
437
+ - 180
438
+ - 240
439
+ - 300
440
+ - 360
441
+ generator_adv_loss_params:
442
+ average_by_discriminators: false
443
+ loss_type: mse
444
+ discriminator_adv_loss_params:
445
+ average_by_discriminators: false
446
+ loss_type: mse
447
+ feat_match_loss_params:
448
+ average_by_discriminators: false
449
+ average_by_layers: false
450
+ include_final_outputs: true
451
+ mel_loss_params:
452
+ fs: 24000
453
+ n_fft: 2048
454
+ hop_length: 480
455
+ win_length: 2048
456
+ window: hann
457
+ n_mels: 80
458
+ fmin: 0
459
+ fmax: 12000
460
+ log_base: null
461
+ lambda_adv: 1.0
462
+ lambda_mel: 45.0
463
+ lambda_feat_match: 2.0
464
+ lambda_dur: 0.1
465
+ lambda_pitch: 10.0
466
+ lambda_phoneme: 1.0
467
+ lambda_kl: 1.0
468
+ sampling_rate: 24000
469
+ cache_generator_outputs: true
470
+ pitch_extract: dio
471
+ pitch_extract_conf:
472
+ use_token_averaged_f0: false
473
+ use_log_f0: false
474
+ fs: 24000
475
+ n_fft: 2048
476
+ hop_length: 480
477
+ f0max: 800
478
+ f0min: 80
479
+ pitch_normalize: null
480
+ pitch_normalize_conf: {}
481
+ ying_extract: null
482
+ ying_extract_conf: {}
483
+ energy_extract: null
484
+ energy_extract_conf: {}
485
+ energy_normalize: null
486
+ energy_normalize_conf: {}
487
+ required:
488
+ - output_dir
489
+ - token_list
490
+ version: '202304'
491
+ distributed: false
492
+ ```
493
+
494
+ </details>
495
+
496
+
497
+
498
+ ### Citing ESPnet
499
+
500
+ ```bibtex
501
+ @inproceedings{watanabe2018espnet,
502
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
503
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
504
+ year={2018},
505
+ booktitle={Proceedings of Interspeech},
506
+ pages={2207--2211},
507
+ doi={10.21437/Interspeech.2018-1456},
508
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
509
+ }
510
+
511
+
512
+
513
+
514
+
515
+
516
+ @inproceedings{shi22d_interspeech,
517
+ author={Jiatong Shi and Shuai Guo and Tao Qian and Tomoki Hayashi and Yuning Wu and Fangzheng Xu and Xuankai Chang and Huazhe Li and Peter Wu and Shinji Watanabe and Qin Jin},
518
+ title={{Muskits: an End-to-end Music Processing Toolkit for Singing Voice Synthesis}},
519
+ year=2022,
520
+ booktitle={Proc. Interspeech 2022},
521
+ pages={4277--4281},
522
+ doi={10.21437/Interspeech.2022-10039}
523
+ }
524
+ ```
525
+
526
+ or arXiv:
527
+
528
+ ```bibtex
529
+ @misc{watanabe2018espnet,
530
+ title={ESPnet: End-to-End Speech Processing Toolkit},
531
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
532
+ year={2018},
533
+ eprint={1804.00015},
534
+ archivePrefix={arXiv},
535
+ primaryClass={cs.CL}
536
+ }
537
+ ```
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/200epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:956aefb90a1201cd6438ecc75cfe6b80f85908a30104ab57a81fbcddc6b0c9af
3
+ size 1744856027
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/config.yaml ADDED
@@ -0,0 +1,454 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: ./conf/tuning/train_visinger_mert.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/24k/svs_train_visinger_mert_raw_phn_None_zh
7
+ ngpu: 1
8
+ seed: 777
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: false
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 200
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - train
38
+ - total_count
39
+ - max
40
+ keep_nbest_models: 10
41
+ nbest_averaging_interval: 0
42
+ grad_clip: -1
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 1
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: 50
51
+ use_matplotlib: true
52
+ use_tensorboard: true
53
+ create_graph_in_tensorboard: false
54
+ use_wandb: false
55
+ wandb_project: null
56
+ wandb_id: null
57
+ wandb_entity: null
58
+ wandb_name: null
59
+ wandb_model_log_interval: -1
60
+ detect_anomaly: false
61
+ pretrain_path: null
62
+ init_param: []
63
+ ignore_init_mismatch: false
64
+ freeze_param: []
65
+ num_iters_per_epoch: 1000
66
+ batch_size: 4
67
+ valid_batch_size: null
68
+ batch_bins: 1000000
69
+ valid_batch_bins: null
70
+ train_shape_file:
71
+ - exp/24k/svs_stats_raw_phn_None_zh/train/text_shape.phn
72
+ - exp/24k/svs_stats_raw_phn_None_zh/train/singing_shape
73
+ valid_shape_file:
74
+ - exp/24k/svs_stats_raw_phn_None_zh/valid/text_shape.phn
75
+ - exp/24k/svs_stats_raw_phn_None_zh/valid/singing_shape
76
+ batch_type: sorted
77
+ valid_batch_type: null
78
+ fold_length:
79
+ - 150
80
+ - 384000
81
+ sort_in_batch: descending
82
+ sort_batch: descending
83
+ multiple_iterator: false
84
+ chunk_length: 500
85
+ chunk_shift_ratio: 0.5
86
+ num_cache_chunks: 1024
87
+ chunk_excluded_key_prefixes: []
88
+ train_data_path_and_name_and_type:
89
+ - - dump/24k/raw/tr_no_dev/text
90
+ - text
91
+ - text
92
+ - - dump/24k/raw/tr_no_dev/wav.scp
93
+ - singing
94
+ - sound
95
+ - - dump/24k/raw/tr_no_dev/label
96
+ - label
97
+ - duration
98
+ - - dump/24k/raw/tr_no_dev/score.scp
99
+ - score
100
+ - score
101
+ valid_data_path_and_name_and_type:
102
+ - - dump/24k/raw/dev/text
103
+ - text
104
+ - text
105
+ - - dump/24k/raw/dev/wav.scp
106
+ - singing
107
+ - sound
108
+ - - dump/24k/raw/dev/label
109
+ - label
110
+ - duration
111
+ - - dump/24k/raw/dev/score.scp
112
+ - score
113
+ - score
114
+ allow_variable_data_keys: false
115
+ max_cache_size: 0.0
116
+ max_cache_fd: 32
117
+ valid_max_cache_size: null
118
+ exclude_weight_decay: false
119
+ exclude_weight_decay_conf: {}
120
+ optim: adamw
121
+ optim_conf:
122
+ lr: 0.0002
123
+ betas:
124
+ - 0.8
125
+ - 0.99
126
+ eps: 1.0e-09
127
+ weight_decay: 0.0
128
+ scheduler: exponentiallr
129
+ scheduler_conf:
130
+ gamma: 0.998
131
+ optim2: adamw
132
+ optim2_conf:
133
+ lr: 0.0002
134
+ betas:
135
+ - 0.8
136
+ - 0.99
137
+ eps: 1.0e-09
138
+ weight_decay: 0.0
139
+ scheduler2: exponentiallr
140
+ scheduler2_conf:
141
+ gamma: 0.998
142
+ generator_first: false
143
+ input_size: null
144
+ token_list:
145
+ - <blank>
146
+ - <unk>
147
+ - SP
148
+ - i
149
+ - AP
150
+ - e
151
+ - y
152
+ - d
153
+ - w
154
+ - sh
155
+ - ai
156
+ - n
157
+ - x
158
+ - j
159
+ - ian
160
+ - u
161
+ - l
162
+ - h
163
+ - b
164
+ - o
165
+ - zh
166
+ - an
167
+ - ou
168
+ - m
169
+ - q
170
+ - z
171
+ - en
172
+ - g
173
+ - ing
174
+ - ei
175
+ - ao
176
+ - ang
177
+ - uo
178
+ - eng
179
+ - t
180
+ - a
181
+ - ong
182
+ - ui
183
+ - k
184
+ - f
185
+ - r
186
+ - iang
187
+ - ch
188
+ - v
189
+ - in
190
+ - iao
191
+ - ie
192
+ - iu
193
+ - c
194
+ - s
195
+ - van
196
+ - p
197
+ - ve
198
+ - uan
199
+ - uang
200
+ - ia
201
+ - ua
202
+ - uai
203
+ - un
204
+ - er
205
+ - vn
206
+ - iong
207
+ - <sos/eos>
208
+ odim: null
209
+ model_conf: {}
210
+ use_preprocessor: true
211
+ token_type: phn
212
+ bpemodel: null
213
+ non_linguistic_symbols: null
214
+ cleaner: null
215
+ g2p: null
216
+ fs: 24000
217
+ frontend: s3prl
218
+ frontend_conf:
219
+ frontend_conf:
220
+ upstream: hf_mert_custom
221
+ path_or_url: m-a-p/MERT-v1-330M
222
+ download_dir: ./hub
223
+ multilayer_feature: true
224
+ score_feats_extract: syllable_score_feats
225
+ score_feats_extract_conf:
226
+ fs: 24000
227
+ n_fft: 2048
228
+ win_length: 2048
229
+ hop_length: 480
230
+ feats_extract: fbank
231
+ feats_extract_conf:
232
+ n_fft: 2048
233
+ hop_length: 480
234
+ win_length: 2048
235
+ fs: 24000
236
+ fmin: 0
237
+ fmax: 22050
238
+ n_mels: 80
239
+ normalize: null
240
+ normalize_conf: {}
241
+ svs: vits
242
+ svs_conf:
243
+ generator_type: visinger2
244
+ vocoder_generator_type: visinger2
245
+ generator_params:
246
+ hidden_channels: 192
247
+ spks: -1
248
+ global_channels: -1
249
+ segment_size: 20
250
+ text_encoder_attention_heads: 2
251
+ text_encoder_ffn_expand: 4
252
+ text_encoder_blocks: 6
253
+ text_encoder_positionwise_layer_type: conv1d
254
+ text_encoder_positionwise_conv_kernel_size: 3
255
+ text_encoder_positional_encoding_layer_type: rel_pos
256
+ text_encoder_self_attention_layer_type: rel_selfattn
257
+ text_encoder_activation_type: swish
258
+ text_encoder_normalize_before: true
259
+ text_encoder_dropout_rate: 0.1
260
+ text_encoder_positional_dropout_rate: 0.0
261
+ text_encoder_attention_dropout_rate: 0.1
262
+ use_macaron_style_in_text_encoder: true
263
+ use_conformer_conv_in_text_encoder: false
264
+ text_encoder_conformer_kernel_size: -1
265
+ decoder_kernel_size: 7
266
+ decoder_channels: 512
267
+ decoder_upsample_scales:
268
+ - 12
269
+ - 10
270
+ - 2
271
+ - 2
272
+ decoder_upsample_kernel_sizes:
273
+ - 24
274
+ - 20
275
+ - 4
276
+ - 4
277
+ decoder_resblock_kernel_sizes:
278
+ - 3
279
+ - 7
280
+ - 11
281
+ decoder_resblock_dilations:
282
+ - - 1
283
+ - 3
284
+ - 5
285
+ - - 1
286
+ - 3
287
+ - 5
288
+ - - 1
289
+ - 3
290
+ - 5
291
+ use_weight_norm_in_decoder: true
292
+ posterior_encoder_kernel_size: 3
293
+ posterior_encoder_layers: 8
294
+ posterior_encoder_stacks: 1
295
+ posterior_encoder_base_dilation: 1
296
+ posterior_encoder_dropout_rate: 0.0
297
+ use_weight_norm_in_posterior_encoder: true
298
+ flow_flows: -1
299
+ flow_kernel_size: 5
300
+ flow_base_dilation: 1
301
+ flow_layers: 4
302
+ flow_dropout_rate: 0.0
303
+ use_weight_norm_in_flow: true
304
+ use_only_mean_in_flow: true
305
+ use_phoneme_predictor: false
306
+ vocabs: 63
307
+ aux_channels: 80
308
+ generator_type: visinger2
309
+ vocoder_generator_type: visinger2
310
+ fs: 24000
311
+ hop_length: 480
312
+ win_length: 2048
313
+ n_fft: 2048
314
+ discriminator_type: visinger2
315
+ discriminator_params:
316
+ scales: 1
317
+ scale_downsample_pooling: AvgPool1d
318
+ scale_downsample_pooling_params:
319
+ kernel_size: 4
320
+ stride: 2
321
+ padding: 2
322
+ scale_discriminator_params:
323
+ in_channels: 1
324
+ out_channels: 1
325
+ kernel_sizes:
326
+ - 15
327
+ - 41
328
+ - 5
329
+ - 3
330
+ channels: 128
331
+ max_downsample_channels: 1024
332
+ max_groups: 256
333
+ bias: true
334
+ downsample_scales:
335
+ - 4
336
+ - 4
337
+ - 4
338
+ - 4
339
+ nonlinear_activation: LeakyReLU
340
+ nonlinear_activation_params:
341
+ negative_slope: 0.1
342
+ use_weight_norm: true
343
+ use_spectral_norm: false
344
+ follow_official_norm: false
345
+ periods:
346
+ - 2
347
+ - 3
348
+ - 5
349
+ - 7
350
+ - 11
351
+ period_discriminator_params:
352
+ in_channels: 1
353
+ out_channels: 1
354
+ kernel_sizes:
355
+ - 5
356
+ - 3
357
+ channels: 32
358
+ downsample_scales:
359
+ - 3
360
+ - 3
361
+ - 3
362
+ - 3
363
+ - 1
364
+ max_downsample_channels: 1024
365
+ bias: true
366
+ nonlinear_activation: LeakyReLU
367
+ nonlinear_activation_params:
368
+ negative_slope: 0.1
369
+ use_weight_norm: true
370
+ use_spectral_norm: false
371
+ multi_freq_disc_params:
372
+ hidden_channels:
373
+ - 256
374
+ - 256
375
+ - 256
376
+ - 256
377
+ - 256
378
+ domain: double
379
+ mel_scale: true
380
+ divisors:
381
+ - 32
382
+ - 16
383
+ - 8
384
+ - 4
385
+ - 2
386
+ - 1
387
+ - 1
388
+ strides:
389
+ - 1
390
+ - 2
391
+ - 1
392
+ - 2
393
+ - 1
394
+ - 2
395
+ - 1
396
+ sample_rate: 24000
397
+ hop_lengths:
398
+ - 60
399
+ - 120
400
+ - 180
401
+ - 240
402
+ - 300
403
+ - 360
404
+ generator_adv_loss_params:
405
+ average_by_discriminators: false
406
+ loss_type: mse
407
+ discriminator_adv_loss_params:
408
+ average_by_discriminators: false
409
+ loss_type: mse
410
+ feat_match_loss_params:
411
+ average_by_discriminators: false
412
+ average_by_layers: false
413
+ include_final_outputs: true
414
+ mel_loss_params:
415
+ fs: 24000
416
+ n_fft: 2048
417
+ hop_length: 480
418
+ win_length: 2048
419
+ window: hann
420
+ n_mels: 80
421
+ fmin: 0
422
+ fmax: 12000
423
+ log_base: null
424
+ lambda_adv: 1.0
425
+ lambda_mel: 45.0
426
+ lambda_feat_match: 2.0
427
+ lambda_dur: 0.1
428
+ lambda_pitch: 10.0
429
+ lambda_phoneme: 1.0
430
+ lambda_kl: 1.0
431
+ sampling_rate: 24000
432
+ cache_generator_outputs: true
433
+ pitch_extract: dio
434
+ pitch_extract_conf:
435
+ use_token_averaged_f0: false
436
+ use_log_f0: false
437
+ fs: 24000
438
+ n_fft: 2048
439
+ hop_length: 480
440
+ f0max: 800
441
+ f0min: 80
442
+ pitch_normalize: null
443
+ pitch_normalize_conf: {}
444
+ ying_extract: null
445
+ ying_extract_conf: {}
446
+ energy_extract: null
447
+ energy_extract_conf: {}
448
+ energy_normalize: null
449
+ energy_normalize_conf: {}
450
+ required:
451
+ - output_dir
452
+ - token_list
453
+ version: '202304'
454
+ distributed: false
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/discriminator_backward_time.png ADDED
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/discriminator_fake_loss.png ADDED
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/discriminator_forward_time.png ADDED
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/discriminator_loss.png ADDED
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/discriminator_optim_step_time.png ADDED
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/discriminator_real_loss.png ADDED
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/discriminator_train_time.png ADDED
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_adv_loss.png ADDED
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_backward_time.png ADDED
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_feat_match_loss.png ADDED
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_forward_time.png ADDED
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_kl_loss.png ADDED
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_loss.png ADDED
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_mel_am_loss.png ADDED
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_mel_ddsp_loss.png ADDED
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_mel_loss.png ADDED
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_optim_step_time.png ADDED
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_phn_dur_loss.png ADDED
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_pitch_loss.png ADDED
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_score_dur_loss.png ADDED
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/generator_train_time.png ADDED
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/gpu_max_cached_mem_GB.png ADDED
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/iter_time.png ADDED
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/optim0_lr0.png ADDED
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/optim1_lr0.png ADDED
exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/images/train_time.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202409'
2
+ files:
3
+ model_file: exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/200epoch.pth
4
+ python: "3.9.18 (main, Sep 11 2023, 13:41:44) \n[GCC 11.2.0]"
5
+ timestamp: 1731045316.275079
6
+ torch: 2.0.0.dev20230206+cu118
7
+ yaml_files:
8
+ train_config: exp/24k/svs_train_visinger2plus_mert_raw_phn_None_zh/config.yaml