zxdu20 committed on
Commit
9cc290e
1 Parent(s): 8859ed8

Upload folder using huggingface_hub

Files changed (3)
  1. config.yaml +130 -0
  2. flow.pt +3 -0
  3. hift.pt +3 -0
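
The commit message indicates the files were pushed with huggingface_hub's folder upload. A minimal sketch of that kind of call (the repo id and local path are placeholders; only the commit message and file list come from this page):

# Sketch of the call that typically produces a commit like this one.
# upload_folder is part of huggingface_hub; repo_id and folder_path are placeholders.
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path='./decoder',                 # local folder holding config.yaml, flow.pt, hift.pt
    repo_id='your-namespace/your-repo',      # placeholder repo id, not taken from this page
    commit_message='Upload folder using huggingface_hub',
)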
config.yaml ADDED
@@ -0,0 +1,130 @@
+ # set random seeds, so that you can reproduce your results
+ __set_seed1: !apply:random.seed [1986]
+ __set_seed2: !apply:numpy.random.seed [1986]
+ __set_seed3: !apply:torch.manual_seed [1986]
+ __set_seed4: !apply:torch.cuda.manual_seed_all [1986]
+
+ # fixed params
+ sample_rate: 22050
+ text_encoder_input_size: 512
+ llm_input_size: 1024
+ llm_output_size: 1024
+ spk_embed_dim: 192
+
+ # model params
+ # for all classes/functions included in this repo, we use !<name> or !<new> for initialization, so that users can find every corresponding class/function from this single yaml.
+ # for system/third_party classes/functions, we do not require this.
+ llm: !new:cosyvoice.llm.llm.TransformerLM
+     text_encoder_input_size: !ref <text_encoder_input_size>
+     llm_input_size: !ref <llm_input_size>
+     llm_output_size: !ref <llm_output_size>
+     text_token_size: 51866
+     speech_token_size: 4096
+     length_normalized_loss: True
+     lsm_weight: 0
+     spk_embed_dim: !ref <spk_embed_dim>
+     text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
+         input_size: !ref <text_encoder_input_size>
+         output_size: 1024
+         attention_heads: 8
+         linear_units: 2048
+         num_blocks: 3
+         dropout_rate: 0.1
+         positional_dropout_rate: 0.1
+         attention_dropout_rate: 0
+         normalize_before: True
+         input_layer: 'linear'
+         pos_enc_layer_type: 'rel_pos_espnet'
+         selfattention_layer_type: 'rel_selfattn'
+         use_cnn_module: False
+         macaron_style: False
+         use_dynamic_chunk: False
+         use_dynamic_left_chunk: False
+         static_chunk_size: 1
+     llm: !new:cosyvoice.transformer.encoder.TransformerEncoder
+         input_size: !ref <llm_input_size>
+         output_size: !ref <llm_output_size>
+         attention_heads: 8
+         linear_units: 2048
+         num_blocks: 7
+         dropout_rate: 0.1
+         positional_dropout_rate: 0.1
+         attention_dropout_rate: 0
+         input_layer: 'linear_legacy'
+         pos_enc_layer_type: 'rel_pos_espnet'
+         selfattention_layer_type: 'rel_selfattn'
+         static_chunk_size: 1
+
+ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
+     input_size: 512
+     output_size: 80
+     spk_embed_dim: !ref <spk_embed_dim>
+     output_type: 'mel'
+     vocab_size: 16384
+     input_frame_rate: 12.5
+     only_mask_loss: True
+     encoder: !new:cosyvoice.transformer.encoder.BlockConformerEncoder
+         output_size: 512
+         attention_heads: 8
+         linear_units: 2048
+         num_blocks: 6
+         dropout_rate: 0.1
+         positional_dropout_rate: 0.1
+         attention_dropout_rate: 0.1
+         normalize_before: True
+         input_layer: 'linear'
+         pos_enc_layer_type: 'rel_pos_espnet'
+         selfattention_layer_type: 'block_rel_selfattn'
+         block_size: 10
+         input_size: 512
+         use_cnn_module: False
+         macaron_style: False
+     length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator
+         channels: 80
+         sampling_ratios: [1, 1, 1, 1]
+     decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM
+         in_channels: 240
+         n_spks: 1
+         spk_emb_dim: 80
+         cfm_params: !new:omegaconf.DictConfig
+             content:
+                 sigma_min: 1e-06
+                 solver: 'euler'
+                 t_scheduler: 'cosine'
+                 training_cfg_rate: 0.2
+                 inference_cfg_rate: 0.7
+                 reg_loss_type: 'l1'
+         estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
+             in_channels: 320
+             out_channels: 80
+             channels: [256, 256]
+             dropout: 0
+             attention_head_dim: 64
+             n_blocks: 4
+             num_mid_blocks: 12
+             num_heads: 8
+             act_fn: 'gelu'
+
+ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
+     in_channels: 80
+     base_channels: 512
+     nb_harmonics: 8
+     sampling_rate: !ref <sample_rate>
+     nsf_alpha: 0.1
+     nsf_sigma: 0.003
+     nsf_voiced_threshold: 10
+     upsample_rates: [8, 8]
+     upsample_kernel_sizes: [16, 16]
+     istft_params:
+         n_fft: 16
+         hop_len: 4
+     resblock_kernel_sizes: [3, 7, 11]
+     resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+     source_resblock_kernel_sizes: [7, 11]
+     source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
+     lrelu_slope: 0.1
+     audio_limit: 0.99
+     f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
+         num_class: 1
+         in_channels: 80
+         cond_channels: 512
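
config.yaml is a HyperPyYAML file: the !new: tags instantiate the listed classes and !ref <...> resolves the shared hyperparameters at load time. A minimal sketch of wiring the file together with the two checkpoints in this commit (assuming the cosyvoice package and hyperpyyaml are importable; file paths are illustrative, and this commit only ships weights for flow and hift):

# Minimal sketch of loading this config and the uploaded checkpoints.
import torch
from hyperpyyaml import load_hyperpyyaml

with open('config.yaml', 'r') as f:
    configs = load_hyperpyyaml(f)   # instantiates llm, flow and hift from the !new: tags

flow = configs['flow']
hift = configs['hift']

# flow.pt and hift.pt are assumed to be plain state dicts for the two modules above;
# adjust map_location / strict to your setup.
flow.load_state_dict(torch.load('flow.pt', map_location='cpu'))
hift.load_state_dict(torch.load('hift.pt', map_location='cpu'))
flow.eval()
hift.eval()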
flow.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:450f52ba9cbea63757914e72b42ac5af21509a2767de0b18b4694546fd1d30ed
+ size 445171500
hift.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:91e679b6ca1eff71187ffb4f3ab0444935594cdcc20a9bd12afad111ef8d6012
+ size 81896716
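
Both .pt files are stored as Git LFS pointers; the actual tensors are fetched by git lfs pull or the huggingface_hub download helpers. A small sketch for checking that downloaded files match the sha256 oids and byte sizes recorded above (local paths are illustrative):

# Sketch of verifying downloaded weights against the LFS pointers above.
# The digests and sizes are taken from this commit; file paths are illustrative.
import hashlib
import os

EXPECTED = {
    'flow.pt': ('450f52ba9cbea63757914e72b42ac5af21509a2767de0b18b4694546fd1d30ed', 445171500),
    'hift.pt': ('91e679b6ca1eff71187ffb4f3ab0444935594cdcc20a9bd12afad111ef8d6012', 81896716),
}

for name, (oid, size) in EXPECTED.items():
    assert os.path.getsize(name) == size, f'{name}: unexpected size'
    h = hashlib.sha256()
    with open(name, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            h.update(chunk)
    assert h.hexdigest() == oid, f'{name}: checksum mismatch'
    print(f'{name}: OK')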