Spaces:
Running
on
Zero
Running
on
Zero
update 25hz yaml
Browse files
examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml
CHANGED
@@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
|
|
18 |
text_encoder_input_size: !ref <text_encoder_input_size>
|
19 |
llm_input_size: !ref <llm_input_size>
|
20 |
llm_output_size: !ref <llm_output_size>
|
21 |
-
text_token_size: 51866
|
22 |
speech_token_size: 4096
|
23 |
length_normalized_loss: True
|
24 |
lsm_weight: 0
|
@@ -66,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
|
66 |
spk_embed_dim: !ref <spk_embed_dim>
|
67 |
output_type: 'mel'
|
68 |
vocab_size: 4096
|
69 |
-
input_frame_rate: 50
|
70 |
only_mask_loss: True
|
71 |
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
|
72 |
output_size: 512
|
@@ -135,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
|
|
135 |
|
136 |
# processor functions
|
137 |
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
138 |
-
get_tokenizer: !name:whisper.tokenizer.get_tokenizer
|
139 |
multilingual: True
|
140 |
num_languages: 100
|
141 |
language: 'en'
|
|
|
18 |
text_encoder_input_size: !ref <text_encoder_input_size>
|
19 |
llm_input_size: !ref <llm_input_size>
|
20 |
llm_output_size: !ref <llm_output_size>
|
21 |
+
text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe
|
22 |
speech_token_size: 4096
|
23 |
length_normalized_loss: True
|
24 |
lsm_weight: 0
|
|
|
66 |
spk_embed_dim: !ref <spk_embed_dim>
|
67 |
output_type: 'mel'
|
68 |
vocab_size: 4096
|
69 |
+
input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe
|
70 |
only_mask_loss: True
|
71 |
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
|
72 |
output_size: 512
|
|
|
135 |
|
136 |
# processor functions
|
137 |
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
138 |
+
get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
|
139 |
multilingual: True
|
140 |
num_languages: 100
|
141 |
language: 'en'
|
examples/libritts/cosyvoice/conf/cosyvoice.yaml
CHANGED
@@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
|
|
18 |
text_encoder_input_size: !ref <text_encoder_input_size>
|
19 |
llm_input_size: !ref <llm_input_size>
|
20 |
llm_output_size: !ref <llm_output_size>
|
21 |
-
text_token_size: 51866
|
22 |
speech_token_size: 4096
|
23 |
length_normalized_loss: True
|
24 |
lsm_weight: 0
|
@@ -66,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
|
66 |
spk_embed_dim: !ref <spk_embed_dim>
|
67 |
output_type: 'mel'
|
68 |
vocab_size: 4096
|
69 |
-
input_frame_rate: 50
|
70 |
only_mask_loss: True
|
71 |
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
|
72 |
output_size: 512
|
@@ -135,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
|
|
135 |
|
136 |
# processor functions
|
137 |
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
138 |
-
get_tokenizer: !name:whisper.tokenizer.get_tokenizer
|
139 |
multilingual: True
|
140 |
num_languages: 100
|
141 |
language: 'en'
|
|
|
18 |
text_encoder_input_size: !ref <text_encoder_input_size>
|
19 |
llm_input_size: !ref <llm_input_size>
|
20 |
llm_output_size: !ref <llm_output_size>
|
21 |
+
text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe
|
22 |
speech_token_size: 4096
|
23 |
length_normalized_loss: True
|
24 |
lsm_weight: 0
|
|
|
66 |
spk_embed_dim: !ref <spk_embed_dim>
|
67 |
output_type: 'mel'
|
68 |
vocab_size: 4096
|
69 |
+
input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe
|
70 |
only_mask_loss: True
|
71 |
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
|
72 |
output_size: 512
|
|
|
135 |
|
136 |
# processor functions
|
137 |
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
138 |
+
get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
|
139 |
multilingual: True
|
140 |
num_languages: 100
|
141 |
language: 'en'
|
examples/magicdata-read/cosyvoice/conf/cosyvoice.fromscratch.yaml
CHANGED
@@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
|
|
18 |
text_encoder_input_size: !ref <text_encoder_input_size>
|
19 |
llm_input_size: !ref <llm_input_size>
|
20 |
llm_output_size: !ref <llm_output_size>
|
21 |
-
text_token_size: 51866
|
22 |
speech_token_size: 4096
|
23 |
length_normalized_loss: True
|
24 |
lsm_weight: 0
|
@@ -54,6 +54,11 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
|
|
54 |
pos_enc_layer_type: 'rel_pos_espnet'
|
55 |
selfattention_layer_type: 'rel_selfattn'
|
56 |
static_chunk_size: 1
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
59 |
input_size: 512
|
@@ -61,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
|
61 |
spk_embed_dim: !ref <spk_embed_dim>
|
62 |
output_type: 'mel'
|
63 |
vocab_size: 4096
|
64 |
-
input_frame_rate: 50
|
65 |
only_mask_loss: True
|
66 |
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
|
67 |
output_size: 512
|
@@ -130,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
|
|
130 |
|
131 |
# processor functions
|
132 |
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
133 |
-
get_tokenizer: !name:whisper.tokenizer.get_tokenizer
|
134 |
multilingual: True
|
135 |
num_languages: 100
|
136 |
language: 'en'
|
|
|
18 |
text_encoder_input_size: !ref <text_encoder_input_size>
|
19 |
llm_input_size: !ref <llm_input_size>
|
20 |
llm_output_size: !ref <llm_output_size>
|
21 |
+
text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe
|
22 |
speech_token_size: 4096
|
23 |
length_normalized_loss: True
|
24 |
lsm_weight: 0
|
|
|
54 |
pos_enc_layer_type: 'rel_pos_espnet'
|
55 |
selfattention_layer_type: 'rel_selfattn'
|
56 |
static_chunk_size: 1
|
57 |
+
sampling: !name:cosyvoice.utils.common.ras_sampling
|
58 |
+
top_p: 0.8
|
59 |
+
top_k: 25
|
60 |
+
win_size: 10
|
61 |
+
tau_r: 0.1
|
62 |
|
63 |
flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
64 |
input_size: 512
|
|
|
66 |
spk_embed_dim: !ref <spk_embed_dim>
|
67 |
output_type: 'mel'
|
68 |
vocab_size: 4096
|
69 |
+
input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe
|
70 |
only_mask_loss: True
|
71 |
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
|
72 |
output_size: 512
|
|
|
135 |
|
136 |
# processor functions
|
137 |
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
138 |
+
get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
|
139 |
multilingual: True
|
140 |
num_languages: 100
|
141 |
language: 'en'
|
examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml
CHANGED
@@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
|
|
18 |
text_encoder_input_size: !ref <text_encoder_input_size>
|
19 |
llm_input_size: !ref <llm_input_size>
|
20 |
llm_output_size: !ref <llm_output_size>
|
21 |
-
text_token_size: 51866
|
22 |
speech_token_size: 4096
|
23 |
length_normalized_loss: True
|
24 |
lsm_weight: 0
|
@@ -54,6 +54,11 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
|
|
54 |
pos_enc_layer_type: 'rel_pos_espnet'
|
55 |
selfattention_layer_type: 'rel_selfattn'
|
56 |
static_chunk_size: 1
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
59 |
input_size: 512
|
@@ -61,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
|
61 |
spk_embed_dim: !ref <spk_embed_dim>
|
62 |
output_type: 'mel'
|
63 |
vocab_size: 4096
|
64 |
-
input_frame_rate: 50
|
65 |
only_mask_loss: True
|
66 |
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
|
67 |
output_size: 512
|
@@ -130,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
|
|
130 |
|
131 |
# processor functions
|
132 |
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
133 |
-
get_tokenizer: !name:whisper.tokenizer.get_tokenizer
|
134 |
multilingual: True
|
135 |
num_languages: 100
|
136 |
language: 'en'
|
|
|
18 |
text_encoder_input_size: !ref <text_encoder_input_size>
|
19 |
llm_input_size: !ref <llm_input_size>
|
20 |
llm_output_size: !ref <llm_output_size>
|
21 |
+
text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe
|
22 |
speech_token_size: 4096
|
23 |
length_normalized_loss: True
|
24 |
lsm_weight: 0
|
|
|
54 |
pos_enc_layer_type: 'rel_pos_espnet'
|
55 |
selfattention_layer_type: 'rel_selfattn'
|
56 |
static_chunk_size: 1
|
57 |
+
sampling: !name:cosyvoice.utils.common.ras_sampling
|
58 |
+
top_p: 0.8
|
59 |
+
top_k: 25
|
60 |
+
win_size: 10
|
61 |
+
tau_r: 0.1
|
62 |
|
63 |
flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
64 |
input_size: 512
|
|
|
66 |
spk_embed_dim: !ref <spk_embed_dim>
|
67 |
output_type: 'mel'
|
68 |
vocab_size: 4096
|
69 |
+
input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe
|
70 |
only_mask_loss: True
|
71 |
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
|
72 |
output_size: 512
|
|
|
135 |
|
136 |
# processor functions
|
137 |
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
138 |
+
get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
|
139 |
multilingual: True
|
140 |
num_languages: 100
|
141 |
language: 'en'
|