ftshijt committed on
Commit
c361690
1 Parent(s): 82d69a8

Update model

Browse files
Files changed (33) hide show
  1. README.md +546 -3
  2. dump/raw/org/tr_no_dev_org/spk2sid +2 -0
  3. exp/svs_kising_transfer/250epoch.pth +3 -0
  4. exp/svs_kising_transfer/config.yaml +463 -0
  5. exp/svs_kising_transfer/images/discriminator_backward_time.png +0 -0
  6. exp/svs_kising_transfer/images/discriminator_fake_loss.png +0 -0
  7. exp/svs_kising_transfer/images/discriminator_forward_time.png +0 -0
  8. exp/svs_kising_transfer/images/discriminator_loss.png +0 -0
  9. exp/svs_kising_transfer/images/discriminator_optim_step_time.png +0 -0
  10. exp/svs_kising_transfer/images/discriminator_real_loss.png +0 -0
  11. exp/svs_kising_transfer/images/discriminator_train_time.png +0 -0
  12. exp/svs_kising_transfer/images/generator_adv_loss.png +0 -0
  13. exp/svs_kising_transfer/images/generator_backward_time.png +0 -0
  14. exp/svs_kising_transfer/images/generator_feat_match_loss.png +0 -0
  15. exp/svs_kising_transfer/images/generator_forward_time.png +0 -0
  16. exp/svs_kising_transfer/images/generator_kl_loss.png +0 -0
  17. exp/svs_kising_transfer/images/generator_loss.png +0 -0
  18. exp/svs_kising_transfer/images/generator_mel_am_loss.png +0 -0
  19. exp/svs_kising_transfer/images/generator_mel_ddsp_loss.png +0 -0
  20. exp/svs_kising_transfer/images/generator_mel_loss.png +0 -0
  21. exp/svs_kising_transfer/images/generator_optim_step_time.png +0 -0
  22. exp/svs_kising_transfer/images/generator_phn_dur_loss.png +0 -0
  23. exp/svs_kising_transfer/images/generator_pitch_loss.png +0 -0
  24. exp/svs_kising_transfer/images/generator_score_dur_loss.png +0 -0
  25. exp/svs_kising_transfer/images/generator_train_time.png +0 -0
  26. exp/svs_kising_transfer/images/gpu_max_cached_mem_GB.png +0 -0
  27. exp/svs_kising_transfer/images/iter_time.png +0 -0
  28. exp/svs_kising_transfer/images/optim0_lr0.png +0 -0
  29. exp/svs_kising_transfer/images/optim1_lr0.png +0 -0
  30. exp/svs_kising_transfer/images/train_time.png +0 -0
  31. exp/svs_stats_raw_phn_None_zh/train/feats_stats.npz +3 -0
  32. exp/svs_stats_raw_phn_None_zh/train/pitch_stats.npz +3 -0
  33. meta.yaml +8 -0
README.md CHANGED
@@ -1,3 +1,546 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - singing-voice-synthesis
6
+ language: zh
7
+ datasets:
8
+ - kising
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 SVS model
13
+
14
+ ### `espnet/acekising_svs_visinger2`
15
+
16
+ This model was trained by ftshijt using the kising recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout ba4880118d5249e5dd92e89d107280a0d4f317e8
26
+ pip install -e .
27
+ cd egs2/kising/svs1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model espnet/acekising_svs_visinger2
29
+ ```
30
+
31
+
32
+
33
+ ## SVS config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: conf/tuning/train_visinger2_all.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ drop_last_iter: false
42
+ dry_run: false
43
+ iterator_type: sequence
44
+ valid_iterator_type: null
45
+ output_dir: exp/svs_kising_transfer
46
+ ngpu: 1
47
+ seed: 777
48
+ num_workers: 0
49
+ num_att_plot: 0
50
+ dist_backend: nccl
51
+ dist_init_method: env://
52
+ dist_world_size: null
53
+ dist_rank: null
54
+ local_rank: 0
55
+ dist_master_addr: null
56
+ dist_master_port: null
57
+ dist_launcher: null
58
+ multiprocessing_distributed: false
59
+ unused_parameters: true
60
+ sharded_ddp: false
61
+ cudnn_enabled: true
62
+ cudnn_benchmark: false
63
+ cudnn_deterministic: false
64
+ collect_stats: false
65
+ write_collected_feats: false
66
+ max_epoch: 500
67
+ patience: null
68
+ val_scheduler_criterion:
69
+ - valid
70
+ - loss
71
+ early_stopping_criterion:
72
+ - valid
73
+ - loss
74
+ - min
75
+ best_model_criterion:
76
+ - - train
77
+ - total_count
78
+ - max
79
+ keep_nbest_models: 10
80
+ nbest_averaging_interval: 0
81
+ grad_clip: -1
82
+ grad_clip_type: 2.0
83
+ grad_noise: false
84
+ accum_grad: 1
85
+ no_forward_run: false
86
+ resume: true
87
+ train_dtype: float32
88
+ use_amp: false
89
+ log_interval: 50
90
+ use_matplotlib: true
91
+ use_tensorboard: true
92
+ create_graph_in_tensorboard: false
93
+ use_wandb: false
94
+ wandb_project: null
95
+ wandb_id: null
96
+ wandb_entity: null
97
+ wandb_name: null
98
+ wandb_model_log_interval: -1
99
+ detect_anomaly: false
100
+ use_lora: false
101
+ save_lora_only: true
102
+ lora_conf: {}
103
+ pretrain_path: null
104
+ init_param:
105
+ - /ocean/projects/cis210027p/jiatong/svs/espnet/egs2/kising/svs1/exp/svs_train_visinger2_all_raw_phn_None_zh/latest.pth
106
+ ignore_init_mismatch: true
107
+ freeze_param: []
108
+ num_iters_per_epoch: 1000
109
+ batch_size: 4
110
+ valid_batch_size: null
111
+ batch_bins: 1000000
112
+ valid_batch_bins: null
113
+ train_shape_file:
114
+ - exp/svs_stats_raw_phn_None_zh/train/text_shape.phn
115
+ - exp/svs_stats_raw_phn_None_zh/train/singing_shape
116
+ valid_shape_file:
117
+ - exp/svs_stats_raw_phn_None_zh/valid/text_shape.phn
118
+ - exp/svs_stats_raw_phn_None_zh/valid/singing_shape
119
+ batch_type: sorted
120
+ valid_batch_type: null
121
+ fold_length:
122
+ - 150
123
+ - 409600
124
+ sort_in_batch: descending
125
+ shuffle_within_batch: false
126
+ sort_batch: descending
127
+ multiple_iterator: false
128
+ chunk_length: 500
129
+ chunk_shift_ratio: 0.5
130
+ num_cache_chunks: 1024
131
+ chunk_excluded_key_prefixes: []
132
+ chunk_default_fs: null
133
+ train_data_path_and_name_and_type:
134
+ - - dump/raw/tr_no_dev_org/text
135
+ - text
136
+ - text
137
+ - - dump/raw/tr_no_dev_org/wav.scp
138
+ - singing
139
+ - sound
140
+ - - dump/raw/tr_no_dev_org/label
141
+ - label
142
+ - duration
143
+ - - dump/raw/tr_no_dev_org/score.scp
144
+ - score
145
+ - score
146
+ - - dump/raw/tr_no_dev_org/utt2sid
147
+ - sids
148
+ - text_int
149
+ valid_data_path_and_name_and_type:
150
+ - - dump/raw/dev_org/text
151
+ - text
152
+ - text
153
+ - - dump/raw/dev_org/wav.scp
154
+ - singing
155
+ - sound
156
+ - - dump/raw/dev_org/label
157
+ - label
158
+ - duration
159
+ - - dump/raw/dev_org/score.scp
160
+ - score
161
+ - score
162
+ - - dump/raw/dev_org/utt2sid
163
+ - sids
164
+ - text_int
165
+ allow_variable_data_keys: false
166
+ max_cache_size: 0.0
167
+ max_cache_fd: 32
168
+ allow_multi_rates: false
169
+ valid_max_cache_size: null
170
+ exclude_weight_decay: false
171
+ exclude_weight_decay_conf: {}
172
+ optim: adamw
173
+ optim_conf:
174
+ lr: 0.0002
175
+ betas:
176
+ - 0.8
177
+ - 0.99
178
+ eps: 1.0e-09
179
+ weight_decay: 0.0
180
+ scheduler: exponentiallr
181
+ scheduler_conf:
182
+ gamma: 0.998
183
+ optim2: adamw
184
+ optim2_conf:
185
+ lr: 0.0002
186
+ betas:
187
+ - 0.8
188
+ - 0.99
189
+ eps: 1.0e-09
190
+ weight_decay: 0.0
191
+ scheduler2: exponentiallr
192
+ scheduler2_conf:
193
+ gamma: 0.998
194
+ generator_first: true
195
+ token_list:
196
+ - <blank>
197
+ - <unk>
198
+ - i
199
+ - uo
200
+ - e
201
+ - uei
202
+ - ian
203
+ - a
204
+ - u
205
+ - d
206
+ - sh
207
+ - ai
208
+ - x
209
+ - l
210
+ - n
211
+ - eng
212
+ - an
213
+ - ing
214
+ - h
215
+ - iii
216
+ - j
217
+ - ao
218
+ - iou
219
+ - ong
220
+ - iang
221
+ - in
222
+ - zh
223
+ - iao
224
+ - ang
225
+ - en
226
+ - v
227
+ - ie
228
+ - ei
229
+ - m
230
+ - g
231
+ - q
232
+ - t
233
+ - ou
234
+ - f
235
+ - ua
236
+ - z
237
+ - r
238
+ - ch
239
+ - uan
240
+ - b
241
+ - ia
242
+ - k
243
+ - uang
244
+ - ve
245
+ - c
246
+ - van
247
+ - s
248
+ - uen
249
+ - o
250
+ - ii
251
+ - p
252
+ - iong
253
+ - er
254
+ - uai
255
+ - vn
256
+ - ho
257
+ - <sos/eos>
258
+ odim: null
259
+ model_conf: {}
260
+ use_preprocessor: true
261
+ token_type: phn
262
+ bpemodel: null
263
+ non_linguistic_symbols: null
264
+ cleaner: null
265
+ g2p: null
266
+ fs: 44100
267
+ score_feats_extract: syllable_score_feats
268
+ score_feats_extract_conf:
269
+ fs: 44100
270
+ n_fft: 2048
271
+ win_length: 2048
272
+ hop_length: 512
273
+ feats_extract: fbank
274
+ feats_extract_conf:
275
+ n_fft: 2048
276
+ hop_length: 512
277
+ win_length: 2048
278
+ fs: 44100
279
+ fmin: 0
280
+ fmax: 22050
281
+ n_mels: 80
282
+ normalize: global_mvn
283
+ normalize_conf:
284
+ stats_file: exp/svs_stats_raw_phn_None_zh/train/feats_stats.npz
285
+ svs: vits
286
+ svs_conf:
287
+ generator_type: visinger2
288
+ vocoder_generator_type: visinger2
289
+ generator_params:
290
+ hidden_channels: 192
291
+ spks: 37
292
+ global_channels: 256
293
+ segment_size: 20
294
+ text_encoder_attention_heads: 2
295
+ text_encoder_ffn_expand: 4
296
+ text_encoder_blocks: 6
297
+ text_encoder_positionwise_layer_type: conv1d
298
+ text_encoder_positionwise_conv_kernel_size: 3
299
+ text_encoder_positional_encoding_layer_type: rel_pos
300
+ text_encoder_self_attention_layer_type: rel_selfattn
301
+ text_encoder_activation_type: swish
302
+ text_encoder_normalize_before: true
303
+ text_encoder_dropout_rate: 0.1
304
+ text_encoder_positional_dropout_rate: 0.0
305
+ text_encoder_attention_dropout_rate: 0.1
306
+ use_macaron_style_in_text_encoder: true
307
+ use_conformer_conv_in_text_encoder: false
308
+ text_encoder_conformer_kernel_size: -1
309
+ decoder_kernel_size: 7
310
+ decoder_channels: 256
311
+ decoder_upsample_scales:
312
+ - 8
313
+ - 8
314
+ - 4
315
+ - 2
316
+ decoder_upsample_kernel_sizes:
317
+ - 16
318
+ - 16
319
+ - 8
320
+ - 4
321
+ n_harmonic: 64
322
+ decoder_resblock_kernel_sizes:
323
+ - 3
324
+ - 7
325
+ - 11
326
+ decoder_resblock_dilations:
327
+ - - 1
328
+ - 3
329
+ - 5
330
+ - - 1
331
+ - 3
332
+ - 5
333
+ - - 1
334
+ - 3
335
+ - 5
336
+ use_weight_norm_in_decoder: true
337
+ posterior_encoder_kernel_size: 3
338
+ posterior_encoder_layers: 8
339
+ posterior_encoder_stacks: 1
340
+ posterior_encoder_base_dilation: 1
341
+ posterior_encoder_dropout_rate: 0.0
342
+ use_weight_norm_in_posterior_encoder: true
343
+ flow_flows: -1
344
+ flow_kernel_size: 5
345
+ flow_base_dilation: 1
346
+ flow_layers: 4
347
+ flow_dropout_rate: 0.0
348
+ use_weight_norm_in_flow: true
349
+ use_only_mean_in_flow: true
350
+ use_phoneme_predictor: false
351
+ vocabs: 62
352
+ aux_channels: 80
353
+ generator_type: visinger2
354
+ vocoder_generator_type: visinger2
355
+ fs: 44100
356
+ hop_length: 512
357
+ win_length: 2048
358
+ n_fft: 2048
359
+ discriminator_type: visinger2
360
+ discriminator_params:
361
+ scales: 1
362
+ scale_downsample_pooling: AvgPool1d
363
+ scale_downsample_pooling_params:
364
+ kernel_size: 4
365
+ stride: 2
366
+ padding: 2
367
+ scale_discriminator_params:
368
+ in_channels: 1
369
+ out_channels: 1
370
+ kernel_sizes:
371
+ - 15
372
+ - 41
373
+ - 5
374
+ - 3
375
+ channels: 128
376
+ max_downsample_channels: 1024
377
+ max_groups: 256
378
+ bias: true
379
+ downsample_scales:
380
+ - 4
381
+ - 4
382
+ - 4
383
+ - 4
384
+ nonlinear_activation: LeakyReLU
385
+ nonlinear_activation_params:
386
+ negative_slope: 0.1
387
+ use_weight_norm: true
388
+ use_spectral_norm: false
389
+ follow_official_norm: false
390
+ periods:
391
+ - 2
392
+ - 3
393
+ - 5
394
+ - 7
395
+ - 11
396
+ period_discriminator_params:
397
+ in_channels: 1
398
+ out_channels: 1
399
+ kernel_sizes:
400
+ - 5
401
+ - 3
402
+ channels: 32
403
+ downsample_scales:
404
+ - 3
405
+ - 3
406
+ - 3
407
+ - 3
408
+ - 1
409
+ max_downsample_channels: 1024
410
+ bias: true
411
+ nonlinear_activation: LeakyReLU
412
+ nonlinear_activation_params:
413
+ negative_slope: 0.1
414
+ use_weight_norm: true
415
+ use_spectral_norm: false
416
+ multi_freq_disc_params:
417
+ hidden_channels:
418
+ - 256
419
+ - 256
420
+ - 256
421
+ - 256
422
+ - 256
423
+ domain: double
424
+ mel_scale: true
425
+ divisors:
426
+ - 32
427
+ - 16
428
+ - 8
429
+ - 4
430
+ - 2
431
+ - 1
432
+ - 1
433
+ strides:
434
+ - 1
435
+ - 2
436
+ - 1
437
+ - 2
438
+ - 1
439
+ - 2
440
+ - 1
441
+ sample_rate: 44100
442
+ hop_lengths:
443
+ - 110
444
+ - 220
445
+ - 330
446
+ - 441
447
+ - 551
448
+ - 661
449
+ generator_adv_loss_params:
450
+ average_by_discriminators: false
451
+ loss_type: mse
452
+ discriminator_adv_loss_params:
453
+ average_by_discriminators: false
454
+ loss_type: mse
455
+ feat_match_loss_params:
456
+ average_by_discriminators: false
457
+ average_by_layers: false
458
+ include_final_outputs: true
459
+ mel_loss_params:
460
+ fs: 44100
461
+ n_fft: 2048
462
+ hop_length: 512
463
+ win_length: 2048
464
+ window: hann
465
+ n_mels: 80
466
+ fmin: 0
467
+ fmax: 22050
468
+ log_base: null
469
+ lambda_adv: 1.0
470
+ lambda_mel: 45.0
471
+ lambda_feat_match: 2.0
472
+ lambda_dur: 0.1
473
+ lambda_pitch: 10.0
474
+ lambda_phoneme: 1.0
475
+ lambda_kl: 1.0
476
+ sampling_rate: 44100
477
+ cache_generator_outputs: true
478
+ pitch_extract: dio
479
+ pitch_extract_conf:
480
+ use_token_averaged_f0: false
481
+ use_log_f0: false
482
+ fs: 44100
483
+ n_fft: 2048
484
+ hop_length: 512
485
+ f0max: 800
486
+ f0min: 80
487
+ pitch_normalize: null
488
+ pitch_normalize_conf:
489
+ stats_file: exp/svs_stats_raw_phn_None_zh/train/pitch_stats.npz
490
+ ying_extract: null
491
+ ying_extract_conf: {}
492
+ energy_extract: null
493
+ energy_extract_conf: {}
494
+ energy_normalize: null
495
+ energy_normalize_conf: {}
496
+ required:
497
+ - output_dir
498
+ - token_list
499
+ version: '202310'
500
+ distributed: false
501
+ ```
502
+
503
+ </details>
504
+
505
+
506
+
507
+ ### Citing ESPnet
508
+
509
+ ```BibTex
510
+ @inproceedings{watanabe2018espnet,
511
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
512
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
513
+ year={2018},
514
+ booktitle={Proceedings of Interspeech},
515
+ pages={2207--2211},
516
+ doi={10.21437/Interspeech.2018-1456},
517
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
518
+ }
519
+
520
+
521
+
522
+
523
+
524
+
525
+ @inproceedings{shi22d_interspeech,
526
+ author={Jiatong Shi and Shuai Guo and Tao Qian and Tomoki Hayashi and Yuning Wu and Fangzheng Xu and Xuankai Chang and Huazhe Li and Peter Wu and Shinji Watanabe and Qin Jin},
527
+ title={{Muskits: an End-to-end Music Processing Toolkit for Singing Voice Synthesis}},
528
+ year=2022,
529
+ booktitle={Proc. Interspeech 2022},
530
+ pages={4277--4281},
531
+ doi={10.21437/Interspeech.2022-10039}
532
+ }
533
+ ```
534
+
535
+ or arXiv:
536
+
537
+ ```bibtex
538
+ @misc{watanabe2018espnet,
539
+ title={ESPnet: End-to-End Speech Processing Toolkit},
540
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
541
+ year={2018},
542
+ eprint={1804.00015},
543
+ archivePrefix={arXiv},
544
+ primaryClass={cs.CL}
545
+ }
546
+ ```
dump/raw/org/tr_no_dev_org/spk2sid ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ <unk> 0
2
+ original 1
exp/svs_kising_transfer/250epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f33f575b11eb2e6a1e92b90e80d6fba620fe91180acbd04b11cc467eae472cc8
3
+ size 448204507
exp/svs_kising_transfer/config.yaml ADDED
@@ -0,0 +1,463 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_visinger2_all.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: exp/svs_kising_transfer
9
+ ngpu: 1
10
+ seed: 777
11
+ num_workers: 0
12
+ num_att_plot: 0
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: 0
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: true
23
+ sharded_ddp: false
24
+ cudnn_enabled: true
25
+ cudnn_benchmark: false
26
+ cudnn_deterministic: false
27
+ collect_stats: false
28
+ write_collected_feats: false
29
+ max_epoch: 500
30
+ patience: null
31
+ val_scheduler_criterion:
32
+ - valid
33
+ - loss
34
+ early_stopping_criterion:
35
+ - valid
36
+ - loss
37
+ - min
38
+ best_model_criterion:
39
+ - - train
40
+ - total_count
41
+ - max
42
+ keep_nbest_models: 10
43
+ nbest_averaging_interval: 0
44
+ grad_clip: -1
45
+ grad_clip_type: 2.0
46
+ grad_noise: false
47
+ accum_grad: 1
48
+ no_forward_run: false
49
+ resume: true
50
+ train_dtype: float32
51
+ use_amp: false
52
+ log_interval: 50
53
+ use_matplotlib: true
54
+ use_tensorboard: true
55
+ create_graph_in_tensorboard: false
56
+ use_wandb: false
57
+ wandb_project: null
58
+ wandb_id: null
59
+ wandb_entity: null
60
+ wandb_name: null
61
+ wandb_model_log_interval: -1
62
+ detect_anomaly: false
63
+ use_lora: false
64
+ save_lora_only: true
65
+ lora_conf: {}
66
+ pretrain_path: null
67
+ init_param:
68
+ - /ocean/projects/cis210027p/jiatong/svs/espnet/egs2/kising/svs1/exp/svs_train_visinger2_all_raw_phn_None_zh/latest.pth
69
+ ignore_init_mismatch: true
70
+ freeze_param: []
71
+ num_iters_per_epoch: 1000
72
+ batch_size: 4
73
+ valid_batch_size: null
74
+ batch_bins: 1000000
75
+ valid_batch_bins: null
76
+ train_shape_file:
77
+ - exp/svs_stats_raw_phn_None_zh/train/text_shape.phn
78
+ - exp/svs_stats_raw_phn_None_zh/train/singing_shape
79
+ valid_shape_file:
80
+ - exp/svs_stats_raw_phn_None_zh/valid/text_shape.phn
81
+ - exp/svs_stats_raw_phn_None_zh/valid/singing_shape
82
+ batch_type: sorted
83
+ valid_batch_type: null
84
+ fold_length:
85
+ - 150
86
+ - 409600
87
+ sort_in_batch: descending
88
+ shuffle_within_batch: false
89
+ sort_batch: descending
90
+ multiple_iterator: false
91
+ chunk_length: 500
92
+ chunk_shift_ratio: 0.5
93
+ num_cache_chunks: 1024
94
+ chunk_excluded_key_prefixes: []
95
+ chunk_default_fs: null
96
+ train_data_path_and_name_and_type:
97
+ - - dump/raw/tr_no_dev_org/text
98
+ - text
99
+ - text
100
+ - - dump/raw/tr_no_dev_org/wav.scp
101
+ - singing
102
+ - sound
103
+ - - dump/raw/tr_no_dev_org/label
104
+ - label
105
+ - duration
106
+ - - dump/raw/tr_no_dev_org/score.scp
107
+ - score
108
+ - score
109
+ - - dump/raw/tr_no_dev_org/utt2sid
110
+ - sids
111
+ - text_int
112
+ valid_data_path_and_name_and_type:
113
+ - - dump/raw/dev_org/text
114
+ - text
115
+ - text
116
+ - - dump/raw/dev_org/wav.scp
117
+ - singing
118
+ - sound
119
+ - - dump/raw/dev_org/label
120
+ - label
121
+ - duration
122
+ - - dump/raw/dev_org/score.scp
123
+ - score
124
+ - score
125
+ - - dump/raw/dev_org/utt2sid
126
+ - sids
127
+ - text_int
128
+ allow_variable_data_keys: false
129
+ max_cache_size: 0.0
130
+ max_cache_fd: 32
131
+ allow_multi_rates: false
132
+ valid_max_cache_size: null
133
+ exclude_weight_decay: false
134
+ exclude_weight_decay_conf: {}
135
+ optim: adamw
136
+ optim_conf:
137
+ lr: 0.0002
138
+ betas:
139
+ - 0.8
140
+ - 0.99
141
+ eps: 1.0e-09
142
+ weight_decay: 0.0
143
+ scheduler: exponentiallr
144
+ scheduler_conf:
145
+ gamma: 0.998
146
+ optim2: adamw
147
+ optim2_conf:
148
+ lr: 0.0002
149
+ betas:
150
+ - 0.8
151
+ - 0.99
152
+ eps: 1.0e-09
153
+ weight_decay: 0.0
154
+ scheduler2: exponentiallr
155
+ scheduler2_conf:
156
+ gamma: 0.998
157
+ generator_first: true
158
+ token_list:
159
+ - <blank>
160
+ - <unk>
161
+ - i
162
+ - uo
163
+ - e
164
+ - uei
165
+ - ian
166
+ - a
167
+ - u
168
+ - d
169
+ - sh
170
+ - ai
171
+ - x
172
+ - l
173
+ - n
174
+ - eng
175
+ - an
176
+ - ing
177
+ - h
178
+ - iii
179
+ - j
180
+ - ao
181
+ - iou
182
+ - ong
183
+ - iang
184
+ - in
185
+ - zh
186
+ - iao
187
+ - ang
188
+ - en
189
+ - v
190
+ - ie
191
+ - ei
192
+ - m
193
+ - g
194
+ - q
195
+ - t
196
+ - ou
197
+ - f
198
+ - ua
199
+ - z
200
+ - r
201
+ - ch
202
+ - uan
203
+ - b
204
+ - ia
205
+ - k
206
+ - uang
207
+ - ve
208
+ - c
209
+ - van
210
+ - s
211
+ - uen
212
+ - o
213
+ - ii
214
+ - p
215
+ - iong
216
+ - er
217
+ - uai
218
+ - vn
219
+ - ho
220
+ - <sos/eos>
221
+ odim: null
222
+ model_conf: {}
223
+ use_preprocessor: true
224
+ token_type: phn
225
+ bpemodel: null
226
+ non_linguistic_symbols: null
227
+ cleaner: null
228
+ g2p: null
229
+ fs: 44100
230
+ score_feats_extract: syllable_score_feats
231
+ score_feats_extract_conf:
232
+ fs: 44100
233
+ n_fft: 2048
234
+ win_length: 2048
235
+ hop_length: 512
236
+ feats_extract: fbank
237
+ feats_extract_conf:
238
+ n_fft: 2048
239
+ hop_length: 512
240
+ win_length: 2048
241
+ fs: 44100
242
+ fmin: 0
243
+ fmax: 22050
244
+ n_mels: 80
245
+ normalize: global_mvn
246
+ normalize_conf:
247
+ stats_file: exp/svs_stats_raw_phn_None_zh/train/feats_stats.npz
248
+ svs: vits
249
+ svs_conf:
250
+ generator_type: visinger2
251
+ vocoder_generator_type: visinger2
252
+ generator_params:
253
+ hidden_channels: 192
254
+ spks: 37
255
+ global_channels: 256
256
+ segment_size: 20
257
+ text_encoder_attention_heads: 2
258
+ text_encoder_ffn_expand: 4
259
+ text_encoder_blocks: 6
260
+ text_encoder_positionwise_layer_type: conv1d
261
+ text_encoder_positionwise_conv_kernel_size: 3
262
+ text_encoder_positional_encoding_layer_type: rel_pos
263
+ text_encoder_self_attention_layer_type: rel_selfattn
264
+ text_encoder_activation_type: swish
265
+ text_encoder_normalize_before: true
266
+ text_encoder_dropout_rate: 0.1
267
+ text_encoder_positional_dropout_rate: 0.0
268
+ text_encoder_attention_dropout_rate: 0.1
269
+ use_macaron_style_in_text_encoder: true
270
+ use_conformer_conv_in_text_encoder: false
271
+ text_encoder_conformer_kernel_size: -1
272
+ decoder_kernel_size: 7
273
+ decoder_channels: 256
274
+ decoder_upsample_scales:
275
+ - 8
276
+ - 8
277
+ - 4
278
+ - 2
279
+ decoder_upsample_kernel_sizes:
280
+ - 16
281
+ - 16
282
+ - 8
283
+ - 4
284
+ n_harmonic: 64
285
+ decoder_resblock_kernel_sizes:
286
+ - 3
287
+ - 7
288
+ - 11
289
+ decoder_resblock_dilations:
290
+ - - 1
291
+ - 3
292
+ - 5
293
+ - - 1
294
+ - 3
295
+ - 5
296
+ - - 1
297
+ - 3
298
+ - 5
299
+ use_weight_norm_in_decoder: true
300
+ posterior_encoder_kernel_size: 3
301
+ posterior_encoder_layers: 8
302
+ posterior_encoder_stacks: 1
303
+ posterior_encoder_base_dilation: 1
304
+ posterior_encoder_dropout_rate: 0.0
305
+ use_weight_norm_in_posterior_encoder: true
306
+ flow_flows: -1
307
+ flow_kernel_size: 5
308
+ flow_base_dilation: 1
309
+ flow_layers: 4
310
+ flow_dropout_rate: 0.0
311
+ use_weight_norm_in_flow: true
312
+ use_only_mean_in_flow: true
313
+ use_phoneme_predictor: false
314
+ vocabs: 62
315
+ aux_channels: 80
316
+ generator_type: visinger2
317
+ vocoder_generator_type: visinger2
318
+ fs: 44100
319
+ hop_length: 512
320
+ win_length: 2048
321
+ n_fft: 2048
322
+ discriminator_type: visinger2
323
+ discriminator_params:
324
+ scales: 1
325
+ scale_downsample_pooling: AvgPool1d
326
+ scale_downsample_pooling_params:
327
+ kernel_size: 4
328
+ stride: 2
329
+ padding: 2
330
+ scale_discriminator_params:
331
+ in_channels: 1
332
+ out_channels: 1
333
+ kernel_sizes:
334
+ - 15
335
+ - 41
336
+ - 5
337
+ - 3
338
+ channels: 128
339
+ max_downsample_channels: 1024
340
+ max_groups: 256
341
+ bias: true
342
+ downsample_scales:
343
+ - 4
344
+ - 4
345
+ - 4
346
+ - 4
347
+ nonlinear_activation: LeakyReLU
348
+ nonlinear_activation_params:
349
+ negative_slope: 0.1
350
+ use_weight_norm: true
351
+ use_spectral_norm: false
352
+ follow_official_norm: false
353
+ periods:
354
+ - 2
355
+ - 3
356
+ - 5
357
+ - 7
358
+ - 11
359
+ period_discriminator_params:
360
+ in_channels: 1
361
+ out_channels: 1
362
+ kernel_sizes:
363
+ - 5
364
+ - 3
365
+ channels: 32
366
+ downsample_scales:
367
+ - 3
368
+ - 3
369
+ - 3
370
+ - 3
371
+ - 1
372
+ max_downsample_channels: 1024
373
+ bias: true
374
+ nonlinear_activation: LeakyReLU
375
+ nonlinear_activation_params:
376
+ negative_slope: 0.1
377
+ use_weight_norm: true
378
+ use_spectral_norm: false
379
+ multi_freq_disc_params:
380
+ hidden_channels:
381
+ - 256
382
+ - 256
383
+ - 256
384
+ - 256
385
+ - 256
386
+ domain: double
387
+ mel_scale: true
388
+ divisors:
389
+ - 32
390
+ - 16
391
+ - 8
392
+ - 4
393
+ - 2
394
+ - 1
395
+ - 1
396
+ strides:
397
+ - 1
398
+ - 2
399
+ - 1
400
+ - 2
401
+ - 1
402
+ - 2
403
+ - 1
404
+ sample_rate: 44100
405
+ hop_lengths:
406
+ - 110
407
+ - 220
408
+ - 330
409
+ - 441
410
+ - 551
411
+ - 661
412
+ generator_adv_loss_params:
413
+ average_by_discriminators: false
414
+ loss_type: mse
415
+ discriminator_adv_loss_params:
416
+ average_by_discriminators: false
417
+ loss_type: mse
418
+ feat_match_loss_params:
419
+ average_by_discriminators: false
420
+ average_by_layers: false
421
+ include_final_outputs: true
422
+ mel_loss_params:
423
+ fs: 44100
424
+ n_fft: 2048
425
+ hop_length: 512
426
+ win_length: 2048
427
+ window: hann
428
+ n_mels: 80
429
+ fmin: 0
430
+ fmax: 22050
431
+ log_base: null
432
+ lambda_adv: 1.0
433
+ lambda_mel: 45.0
434
+ lambda_feat_match: 2.0
435
+ lambda_dur: 0.1
436
+ lambda_pitch: 10.0
437
+ lambda_phoneme: 1.0
438
+ lambda_kl: 1.0
439
+ sampling_rate: 44100
440
+ cache_generator_outputs: true
441
+ pitch_extract: dio
442
+ pitch_extract_conf:
443
+ use_token_averaged_f0: false
444
+ use_log_f0: false
445
+ fs: 44100
446
+ n_fft: 2048
447
+ hop_length: 512
448
+ f0max: 800
449
+ f0min: 80
450
+ pitch_normalize: null
451
+ pitch_normalize_conf:
452
+ stats_file: exp/svs_stats_raw_phn_None_zh/train/pitch_stats.npz
453
+ ying_extract: null
454
+ ying_extract_conf: {}
455
+ energy_extract: null
456
+ energy_extract_conf: {}
457
+ energy_normalize: null
458
+ energy_normalize_conf: {}
459
+ required:
460
+ - output_dir
461
+ - token_list
462
+ version: '202310'
463
+ distributed: false
exp/svs_kising_transfer/images/discriminator_backward_time.png ADDED
exp/svs_kising_transfer/images/discriminator_fake_loss.png ADDED
exp/svs_kising_transfer/images/discriminator_forward_time.png ADDED
exp/svs_kising_transfer/images/discriminator_loss.png ADDED
exp/svs_kising_transfer/images/discriminator_optim_step_time.png ADDED
exp/svs_kising_transfer/images/discriminator_real_loss.png ADDED
exp/svs_kising_transfer/images/discriminator_train_time.png ADDED
exp/svs_kising_transfer/images/generator_adv_loss.png ADDED
exp/svs_kising_transfer/images/generator_backward_time.png ADDED
exp/svs_kising_transfer/images/generator_feat_match_loss.png ADDED
exp/svs_kising_transfer/images/generator_forward_time.png ADDED
exp/svs_kising_transfer/images/generator_kl_loss.png ADDED
exp/svs_kising_transfer/images/generator_loss.png ADDED
exp/svs_kising_transfer/images/generator_mel_am_loss.png ADDED
exp/svs_kising_transfer/images/generator_mel_ddsp_loss.png ADDED
exp/svs_kising_transfer/images/generator_mel_loss.png ADDED
exp/svs_kising_transfer/images/generator_optim_step_time.png ADDED
exp/svs_kising_transfer/images/generator_phn_dur_loss.png ADDED
exp/svs_kising_transfer/images/generator_pitch_loss.png ADDED
exp/svs_kising_transfer/images/generator_score_dur_loss.png ADDED
exp/svs_kising_transfer/images/generator_train_time.png ADDED
exp/svs_kising_transfer/images/gpu_max_cached_mem_GB.png ADDED
exp/svs_kising_transfer/images/iter_time.png ADDED
exp/svs_kising_transfer/images/optim0_lr0.png ADDED
exp/svs_kising_transfer/images/optim1_lr0.png ADDED
exp/svs_kising_transfer/images/train_time.png ADDED
exp/svs_stats_raw_phn_None_zh/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddd1b9f41a013fa97493319da5ed810e241d2c6491d04c8ec9281a7341f05e9d
3
+ size 1402
exp/svs_stats_raw_phn_None_zh/train/pitch_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbc34870cd97069ce296e3d556f916439d0630281c9ef3d309d9ae11ed67151d
3
+ size 770
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202310'
2
+ files:
3
+ model_file: exp/svs_kising_transfer/250epoch.pth
4
+ python: "3.9.16 (main, Mar 8 2023, 14:00:05) \n[GCC 11.2.0]"
5
+ timestamp: 1726111216.686509
6
+ torch: 1.13.1+cu117
7
+ yaml_files:
8
+ train_config: exp/svs_kising_transfer/config.yaml