Merge pull request #436 from FunAudioLLM/dev/lyuxiang.lx
Changed files:
- README.md +9 -5
- cosyvoice/cli/cosyvoice.py +13 -4
- cosyvoice/cli/frontend.py +14 -0
- cosyvoice/cli/model.py +64 -12
- cosyvoice/flow/flow.py +2 -3
- cosyvoice/flow/length_regulator.py +5 -4
- cosyvoice/llm/llm.py +1 -1
- cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +0 -0
- cosyvoice/tokenizer/tokenizer.py +236 -0
- cosyvoice/utils/common.py +9 -0
- webui.py +1 -7
README.md
CHANGED
@@ -22,12 +22,9 @@ For `SenseVoice`, visit [SenseVoice repo](https://github.com/FunAudioLLM/SenseVoice)
     - [ ] 25hz cosyvoice base model
     - [ ] 25hz cosyvoice voice conversion model
 
-- [ ] 2024/10
-
-    - [ ] 50hz llama based llm model which supports lora finetune
-
 - [ ] TBD
 
+    - [ ] 25hz llama based llm model which supports lora finetune
     - [ ] Support more instruction mode
     - [ ] Voice conversion
     - [ ] Music generation
@@ -74,6 +71,7 @@
 # SDK模型下载
 from modelscope import snapshot_download
 snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M')
+snapshot_download('iic/CosyVoice-300M-25Hz', local_dir='pretrained_models/CosyVoice-300M-25Hz')
 snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT')
 snapshot_download('iic/CosyVoice-300M-Instruct', local_dir='pretrained_models/CosyVoice-300M-Instruct')
 snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd')
@@ -83,6 +81,7 @@ snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd')
 # git模型下载,请确保已安装git lfs
 mkdir -p pretrained_models
 git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M
+git clone https://www.modelscope.cn/iic/CosyVoice-300M-25Hz.git pretrained_models/CosyVoice-300M-25Hz
 git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT
 git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_models/CosyVoice-300M-Instruct
 git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd
@@ -121,7 +120,7 @@ print(cosyvoice.list_avaliable_spks())
 for i, j in enumerate(cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女', stream=False)):
     torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], 22050)
 
-cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M')
+cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-25Hz')  # or change to pretrained_models/CosyVoice-300M for 50Hz inference
 # zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
 prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
 for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
@@ -130,6 +129,11 @@
 prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
 for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k, stream=False)):
     torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], 22050)
+# vc usage
+prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
+source_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
+for i, j in enumerate(cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=False)):
+    torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], 22050)
 
 cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
 # instruct usage, support <laughter></laughter><strong></strong>[laughter][breath]
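The snippets above cover everything needed to try the new 25Hz checkpoint and the voice-conversion entry point. A minimal end-to-end sketch assembled from those snippets; the wav file names are the example prompts shipped with the repo, and any 16 kHz mono recordings should work in their place:

```python
import torchaudio
from modelscope import snapshot_download
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav

# download the 25Hz checkpoint once, then reuse the local copy
snapshot_download('iic/CosyVoice-300M-25Hz', local_dir='pretrained_models/CosyVoice-300M-25Hz')

cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-25Hz')
source_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)  # speech whose content we keep
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)      # speech whose voice we copy
for i, j in enumerate(cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=False)):
    torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], 22050)  # outputs are 22.05 kHz
```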
cosyvoice/cli/cosyvoice.py
CHANGED
@@ -58,7 +58,7 @@ class CosyVoice:
             model_input = self.frontend.frontend_sft(i, spk_id)
             start_time = time.time()
             logging.info('synthesis text {}'.format(i))
-            for model_output in self.model.
+            for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
                 speech_len = model_output['tts_speech'].shape[1] / 22050
                 logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
                 yield model_output
@@ -70,7 +70,7 @@ class CosyVoice:
             model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k)
             start_time = time.time()
             logging.info('synthesis text {}'.format(i))
-            for model_output in self.model.
+            for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
                 speech_len = model_output['tts_speech'].shape[1] / 22050
                 logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
                 yield model_output
@@ -83,7 +83,7 @@ class CosyVoice:
             model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k)
             start_time = time.time()
             logging.info('synthesis text {}'.format(i))
-            for model_output in self.model.
+            for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
                 speech_len = model_output['tts_speech'].shape[1] / 22050
                 logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
                 yield model_output
@@ -97,8 +97,17 @@ class CosyVoice:
             model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text)
             start_time = time.time()
             logging.info('synthesis text {}'.format(i))
-            for model_output in self.model.
+            for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
                 speech_len = model_output['tts_speech'].shape[1] / 22050
                 logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
                 yield model_output
                 start_time = time.time()
+
+    def inference_vc(self, source_speech_16k, prompt_speech_16k, stream=False, speed=1.0):
+        model_input = self.frontend.frontend_vc(source_speech_16k, prompt_speech_16k)
+        start_time = time.time()
+        for model_output in self.model.vc(**model_input, stream=stream, speed=speed):
+            speech_len = model_output['tts_speech'].shape[1] / 22050
+            logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
+            yield model_output
+            start_time = time.time()
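All of the inference_* methods, including the new inference_vc, are generators that yield dicts with a 'tts_speech' tensor at 22.05 kHz, and the logged rtf is simply wall-clock time divided by the duration of the yielded chunk. A hedged sketch, not part of this diff, of a caller that consumes the streaming mode and stitches the chunks back together:

```python
import torch
import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav

cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-25Hz')
source_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)

chunks = []
for out in cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=True):
    chunks.append(out['tts_speech'])  # each chunk is a [1, samples] tensor at 22.05 kHz
torchaudio.save('vc_stream.wav', torch.concat(chunks, dim=1), 22050)
```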
cosyvoice/cli/frontend.py
CHANGED
@@ -55,6 +55,8 @@ class CosyVoiceFrontEnd:
                                                           "CPUExecutionProvider"])
         if os.path.exists(spk2info):
             self.spk2info = torch.load(spk2info, map_location=self.device)
+        else:
+            self.spk2info = {}
         self.instruct = instruct
         self.allowed_special = allowed_special
         self.inflect_parser = inflect.engine()
@@ -172,3 +174,15 @@ class CosyVoiceFrontEnd:
         model_input['prompt_text'] = instruct_text_token
         model_input['prompt_text_len'] = instruct_text_token_len
         return model_input
+
+    def frontend_vc(self, source_speech_16k, prompt_speech_16k):
+        prompt_speech_token, prompt_speech_token_len = self._extract_speech_token(prompt_speech_16k)
+        prompt_speech_22050 = torchaudio.transforms.Resample(orig_freq=16000, new_freq=22050)(prompt_speech_16k)
+        prompt_speech_feat, prompt_speech_feat_len = self._extract_speech_feat(prompt_speech_22050)
+        embedding = self._extract_spk_embedding(prompt_speech_16k)
+        source_speech_token, source_speech_token_len = self._extract_speech_token(source_speech_16k)
+        model_input = {'source_speech_token': source_speech_token, 'source_speech_token_len': source_speech_token_len,
+                       'flow_prompt_speech_token': prompt_speech_token, 'flow_prompt_speech_token_len': prompt_speech_token_len,
+                       'prompt_speech_feat': prompt_speech_feat, 'prompt_speech_feat_len': prompt_speech_feat_len,
+                       'flow_embedding': embedding}
+        return model_input
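frontend_vc reads the same 16 kHz prompt twice: the raw waveform feeds speech-token and speaker-embedding extraction, while a 22.05 kHz resampled copy feeds mel-feature extraction. A standalone check of that resampling step, using a synthetic one-second input:

```python
import torch
import torchaudio

prompt_speech_16k = torch.zeros(1, 16000)  # one second of silence at 16 kHz
resampler = torchaudio.transforms.Resample(orig_freq=16000, new_freq=22050)
prompt_speech_22050 = resampler(prompt_speech_16k)
print(prompt_speech_22050.shape)  # torch.Size([1, 22050])
```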
cosyvoice/cli/model.py
CHANGED
@@ -35,7 +35,7 @@ class CosyVoiceModel:
         self.token_max_hop_len = 200
         self.token_overlap_len = 20
         # mel fade in out
-        self.mel_overlap_len =
+        self.mel_overlap_len = int(self.token_overlap_len / self.flow.input_frame_rate * 22050 / 256)
         self.mel_window = np.hamming(2 * self.mel_overlap_len)
         # hift cache
         self.mel_cache_len = 20
@@ -63,11 +63,11 @@ class CosyVoiceModel:
         self.hift.to(self.device).eval()
 
     def load_jit(self, llm_text_encoder_model, llm_llm_model, flow_encoder_model):
-        llm_text_encoder = torch.jit.load(llm_text_encoder_model)
+        llm_text_encoder = torch.jit.load(llm_text_encoder_model, map_location=self.device)
         self.llm.text_encoder = llm_text_encoder
-        llm_llm = torch.jit.load(llm_llm_model)
+        llm_llm = torch.jit.load(llm_llm_model, map_location=self.device)
         self.llm.llm = llm_llm
-        flow_encoder = torch.jit.load(flow_encoder_model)
+        flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
         self.flow.encoder = flow_encoder
 
     def load_onnx(self, flow_decoder_estimator_model):
@@ -131,11 +131,11 @@ class CosyVoiceModel:
         tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
         return tts_speech
 
-    def
-
-
-
-
+    def tts(self, text, flow_embedding, llm_embedding=torch.zeros(0, 192),
+            prompt_text=torch.zeros(1, 0, dtype=torch.int32),
+            llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
+            flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
+            prompt_speech_feat=torch.zeros(1, 0, 80), stream=False, speed=1.0, **kwargs):
         # this_uuid is used to track variables related to this inference thread
         this_uuid = str(uuid.uuid1())
         with self.lock:
@@ -148,7 +148,8 @@ class CosyVoiceModel:
             while True:
                 time.sleep(0.1)
                 if len(self.tts_speech_token_dict[this_uuid]) >= token_hop_len + self.token_overlap_len:
-                    this_tts_speech_token = torch.
+                    this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_hop_len + self.token_overlap_len]) \
+                        .unsqueeze(dim=0)
                     this_tts_speech = self.token2wav(token=this_tts_speech_token,
                                                      prompt_token=flow_prompt_speech_token,
                                                      prompt_feat=prompt_speech_feat,
@@ -164,7 +165,7 @@ class CosyVoiceModel:
                     break
             p.join()
             # deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None
-            this_tts_speech_token = torch.
+            this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
             this_tts_speech = self.token2wav(token=this_tts_speech_token,
                                              prompt_token=flow_prompt_speech_token,
                                              prompt_feat=prompt_speech_feat,
@@ -175,7 +176,58 @@ class CosyVoiceModel:
         else:
             # deal with all tokens
             p.join()
-            this_tts_speech_token = torch.
+            this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
+            this_tts_speech = self.token2wav(token=this_tts_speech_token,
+                                             prompt_token=flow_prompt_speech_token,
+                                             prompt_feat=prompt_speech_feat,
+                                             embedding=flow_embedding,
+                                             uuid=this_uuid,
+                                             finalize=True,
+                                             speed=speed)
+            yield {'tts_speech': this_tts_speech.cpu()}
+        with self.lock:
+            self.tts_speech_token_dict.pop(this_uuid)
+            self.llm_end_dict.pop(this_uuid)
+            self.mel_overlap_dict.pop(this_uuid)
+            self.hift_cache_dict.pop(this_uuid)
+
+    def vc(self, source_speech_token, flow_prompt_speech_token, prompt_speech_feat, flow_embedding, stream=False, speed=1.0, **kwargs):
+        # this_uuid is used to track variables related to this inference thread
+        this_uuid = str(uuid.uuid1())
+        with self.lock:
+            self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = source_speech_token.flatten().tolist(), True
+            self.mel_overlap_dict[this_uuid], self.hift_cache_dict[this_uuid] = None, None
+        if stream is True:
+            token_hop_len = self.token_min_hop_len
+            while True:
+                if len(self.tts_speech_token_dict[this_uuid]) >= token_hop_len + self.token_overlap_len:
+                    this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_hop_len + self.token_overlap_len]) \
+                        .unsqueeze(dim=0)
+                    this_tts_speech = self.token2wav(token=this_tts_speech_token,
+                                                     prompt_token=flow_prompt_speech_token,
+                                                     prompt_feat=prompt_speech_feat,
+                                                     embedding=flow_embedding,
+                                                     uuid=this_uuid,
+                                                     finalize=False)
+                    yield {'tts_speech': this_tts_speech.cpu()}
+                    with self.lock:
+                        self.tts_speech_token_dict[this_uuid] = self.tts_speech_token_dict[this_uuid][token_hop_len:]
+                    # increase token_hop_len for better speech quality
+                    token_hop_len = min(self.token_max_hop_len, int(token_hop_len * self.stream_scale_factor))
+                if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) < token_hop_len + self.token_overlap_len:
+                    break
+            # deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None
+            this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid], dim=1).unsqueeze(dim=0)
+            this_tts_speech = self.token2wav(token=this_tts_speech_token,
+                                             prompt_token=flow_prompt_speech_token,
+                                             prompt_feat=prompt_speech_feat,
+                                             embedding=flow_embedding,
+                                             uuid=this_uuid,
+                                             finalize=True)
+            yield {'tts_speech': this_tts_speech.cpu()}
+        else:
+            # deal with all tokens
+            this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
             this_tts_speech = self.token2wav(token=this_tts_speech_token,
                                              prompt_token=flow_prompt_speech_token,
                                              prompt_feat=prompt_speech_feat,
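In streaming mode, successive mel chunks are crossfaded over mel_overlap_len frames with the Hamming window built in __init__ (self.mel_window = np.hamming(2 * self.mel_overlap_len)). A toy sketch of that crossfade using the fade_in_out helper from cosyvoice.utils.common; the chunk shapes here are made up:

```python
import numpy as np
import torch
from cosyvoice.utils.common import fade_in_out

mel_overlap_len = 34                      # value the formula above gives for the 50Hz model
window = np.hamming(2 * mel_overlap_len)  # same construction as self.mel_window

prev_chunk = torch.randn(1, 80, 120)      # mel chunk already emitted (80 bins, 120 frames)
next_chunk = torch.randn(1, 80, 120)      # next chunk, overlapping the previous one
blended = fade_in_out(next_chunk, prev_chunk, window)  # fades the overlapping frames into each other
print(blended.shape)                      # torch.Size([1, 80, 120])
```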
cosyvoice/flow/flow.py
CHANGED
@@ -124,15 +124,14 @@ class MaskedDiffWithXvec(torch.nn.Module):
         # text encode
         h, h_lengths = self.encoder(token, token_len)
         h = self.encoder_proj(h)
-        mel_len1, mel_len2 = prompt_feat.shape[1], int(token_len2 /
-        h, h_lengths = self.length_regulator.inference(h[:, :token_len1], h[:, token_len1:], mel_len1, mel_len2)
+        mel_len1, mel_len2 = prompt_feat.shape[1], int(token_len2 / self.input_frame_rate * 22050 / 256)
+        h, h_lengths = self.length_regulator.inference(h[:, :token_len1], h[:, token_len1:], mel_len1, mel_len2, self.input_frame_rate)
 
         # get conditions
         conds = torch.zeros([1, mel_len1 + mel_len2, self.output_size], device=token.device)
         conds[:, :mel_len1] = prompt_feat
         conds = conds.transpose(1, 2)
 
-        # mask = (~make_pad_mask(feat_len)).to(h)
         mask = (~make_pad_mask(torch.tensor([mel_len1 + mel_len2]))).to(h)
         feat = self.decoder(
             mu=h.transpose(1, 2).contiguous(),
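mel_len2 converts a token count into a mel-frame count: tokens divided by the model's token frame rate gives seconds, and the mel front end produces 22050 / 256 (about 86.1) frames per second. The same number of tokens therefore maps to twice as many mel frames for the 25Hz model as for the 50Hz one:

```python
# worked example of the mel_len2 formula above, for both supported token rates
token_len2 = 100
for input_frame_rate in (50, 25):
    mel_len2 = int(token_len2 / input_frame_rate * 22050 / 256)
    print(input_frame_rate, mel_len2)  # 50 -> 172 frames (~2 s), 25 -> 344 frames (~4 s)
```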
cosyvoice/flow/length_regulator.py
CHANGED
@@ -49,13 +49,14 @@ class InterpolateRegulator(nn.Module):
         olens = ylens
         return out * mask, olens
 
-    def inference(self, x1, x2, mel_len1, mel_len2):
+    def inference(self, x1, x2, mel_len1, mel_len2, input_frame_rate=50):
         # in inference mode, interploate prompt token and token(head/mid/tail) seprately, so we can get a clear separation point of mel
         # x in (B, T, D)
         if x2.shape[1] > 40:
-            x2_head = F.interpolate(x2[:, :20].transpose(1, 2).contiguous(), size=
-            x2_mid = F.interpolate(x2[:, 20:-20].transpose(1, 2).contiguous(), size=mel_len2 -
-
+            x2_head = F.interpolate(x2[:, :20].transpose(1, 2).contiguous(), size=int(20 / input_frame_rate * 22050 / 256), mode='linear')
+            x2_mid = F.interpolate(x2[:, 20:-20].transpose(1, 2).contiguous(), size=mel_len2 - int(20 / input_frame_rate * 22050 / 256) * 2,
+                                   mode='linear')
+            x2_tail = F.interpolate(x2[:, -20:].transpose(1, 2).contiguous(), size=int(20 / input_frame_rate * 22050 / 256), mode='linear')
             x2 = torch.concat([x2_head, x2_mid, x2_tail], dim=2)
         else:
             x2 = F.interpolate(x2.transpose(1, 2).contiguous(), size=mel_len2, mode='linear')
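The head/mid/tail split above stretches the first and last 20 tokens by a fixed, frame-rate-dependent ratio so the boundary between prompt and generated mel stays aligned, and lets the middle segment absorb whatever length remains. A self-contained sketch with synthetic tensors; the feature size and token count are made up:

```python
import torch
import torch.nn.functional as F

input_frame_rate, mel_len2 = 25, 344               # e.g. 100 tokens at 25 Hz, see the flow.py example above
x2 = torch.randn(1, 100, 512)                      # (B, T, D) encoder output to be length-regulated
edge = int(20 / input_frame_rate * 22050 / 256)    # 68 mel frames for 20 tokens at 25 Hz

x2_head = F.interpolate(x2[:, :20].transpose(1, 2).contiguous(), size=edge, mode='linear')
x2_mid = F.interpolate(x2[:, 20:-20].transpose(1, 2).contiguous(), size=mel_len2 - 2 * edge, mode='linear')
x2_tail = F.interpolate(x2[:, -20:].transpose(1, 2).contiguous(), size=edge, mode='linear')
out = torch.concat([x2_head, x2_mid, x2_tail], dim=2)
print(out.shape)                                   # torch.Size([1, 512, 344]): head + mid + tail == mel_len2
```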
cosyvoice/llm/llm.py
CHANGED
@@ -206,7 +206,7 @@ class TransformerLM(torch.nn.Module):
             if top_ids == self.speech_token_size:
                 break
             # in stream mode, yield token one by one
-            yield
+            yield top_ids
             out_tokens.append(top_ids)
             offset += lm_input.size(1)
             lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
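With yield top_ids, TransformerLM.inference streams speech-token ids one at a time instead of only collecting them in out_tokens, which is what makes the streaming path in model.py possible. A toy consumer with a stand-in generator (the real call needs text and prompt tensors plus a checkpoint):

```python
def fake_llm_inference():
    # stand-in for TransformerLM.inference: yields sampled speech-token ids one by one
    for token_id in (101, 57, 998):
        yield token_id

tts_speech_token = []
for top_ids in fake_llm_inference():
    tts_speech_token.append(top_ids)  # roughly what model.py does with each yielded id
print(tts_speech_token)               # [101, 57, 998]
```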
cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken
ADDED
The diff for this file is too large to render.
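Judging from get_encoding() in the new cosyvoice/tokenizer/tokenizer.py below, the .tiktoken asset is a plain-text BPE vocabulary with one base64-encoded token and its integer rank per line. A tiny sketch of that format:

```python
import base64

line = base64.b64encode('你'.encode('utf-8')).decode() + ' 123'  # one hypothetical vocab line
token_b64, rank = line.split()
print(base64.b64decode(token_b64), int(rank))                    # b'\xe4\xbd\xa0' 123
```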
cosyvoice/tokenizer/tokenizer.py
ADDED
@@ -0,0 +1,236 @@
import base64
import os
from functools import lru_cache
from typing import Optional
from whisper.tokenizer import Tokenizer

import tiktoken

LANGUAGES = {
    "en": "english",
    "zh": "chinese",
    "de": "german",
    "es": "spanish",
    "ru": "russian",
    "ko": "korean",
    "fr": "french",
    "ja": "japanese",
    "pt": "portuguese",
    "tr": "turkish",
    "pl": "polish",
    "ca": "catalan",
    "nl": "dutch",
    "ar": "arabic",
    "sv": "swedish",
    "it": "italian",
    "id": "indonesian",
    "hi": "hindi",
    "fi": "finnish",
    "vi": "vietnamese",
    "he": "hebrew",
    "uk": "ukrainian",
    "el": "greek",
    "ms": "malay",
    "cs": "czech",
    "ro": "romanian",
    "da": "danish",
    "hu": "hungarian",
    "ta": "tamil",
    "no": "norwegian",
    "th": "thai",
    "ur": "urdu",
    "hr": "croatian",
    "bg": "bulgarian",
    "lt": "lithuanian",
    "la": "latin",
    "mi": "maori",
    "ml": "malayalam",
    "cy": "welsh",
    "sk": "slovak",
    "te": "telugu",
    "fa": "persian",
    "lv": "latvian",
    "bn": "bengali",
    "sr": "serbian",
    "az": "azerbaijani",
    "sl": "slovenian",
    "kn": "kannada",
    "et": "estonian",
    "mk": "macedonian",
    "br": "breton",
    "eu": "basque",
    "is": "icelandic",
    "hy": "armenian",
    "ne": "nepali",
    "mn": "mongolian",
    "bs": "bosnian",
    "kk": "kazakh",
    "sq": "albanian",
    "sw": "swahili",
    "gl": "galician",
    "mr": "marathi",
    "pa": "punjabi",
    "si": "sinhala",
    "km": "khmer",
    "sn": "shona",
    "yo": "yoruba",
    "so": "somali",
    "af": "afrikaans",
    "oc": "occitan",
    "ka": "georgian",
    "be": "belarusian",
    "tg": "tajik",
    "sd": "sindhi",
    "gu": "gujarati",
    "am": "amharic",
    "yi": "yiddish",
    "lo": "lao",
    "uz": "uzbek",
    "fo": "faroese",
    "ht": "haitian creole",
    "ps": "pashto",
    "tk": "turkmen",
    "nn": "nynorsk",
    "mt": "maltese",
    "sa": "sanskrit",
    "lb": "luxembourgish",
    "my": "myanmar",
    "bo": "tibetan",
    "tl": "tagalog",
    "mg": "malagasy",
    "as": "assamese",
    "tt": "tatar",
    "haw": "hawaiian",
    "ln": "lingala",
    "ha": "hausa",
    "ba": "bashkir",
    "jw": "javanese",
    "su": "sundanese",
    "yue": "cantonese",
    "minnan": "minnan",
    "wuyu": "wuyu",
    "dialect": "dialect",
    "zh/en": "zh/en",
    "en/zh": "en/zh",
}

# language code lookup by name, with a few language aliases
TO_LANGUAGE_CODE = {
    **{language: code for code, language in LANGUAGES.items()},
    "burmese": "my",
    "valencian": "ca",
    "flemish": "nl",
    "haitian": "ht",
    "letzeburgesch": "lb",
    "pushto": "ps",
    "panjabi": "pa",
    "moldavian": "ro",
    "moldovan": "ro",
    "sinhalese": "si",
    "castilian": "es",
    "mandarin": "zh",
}

AUDIO_EVENT = {
    "ASR": "ASR",
    "AED": "AED",
    "SER": "SER",
    "Speech": "Speech",
    "/Speech": "/Speech",
    "BGM": "BGM",
    "/BGM": "/BGM",
    "Laughter": "Laughter",
    "/Laughter": "/Laughter",
    "Applause": "Applause",
    "/Applause": "/Applause",
}

EMOTION = {
    "HAPPY": "HAPPY",
    "SAD": "SAD",
    "ANGRY": "ANGRY",
    "NEUTRAL": "NEUTRAL",
}

TTS_Vocal_Token = {
    "TTS/B": "TTS/B",
    "TTS/O": "TTS/O",
    "TTS/Q": "TTS/Q",
    "TTS/A": "TTS/A",
    "TTS/CO": "TTS/CO",
    "TTS/CL": "TTS/CL",
    "TTS/H": "TTS/H",
    **{f"TTS/SP{i:02d}": f"TTS/SP{i:02d}" for i in range(1, 14)}
}


@lru_cache(maxsize=None)
def get_encoding(name: str = "gpt2", num_languages: int = 99):
    vocab_path = os.path.join(os.path.dirname(__file__), "assets", f"{name}.tiktoken")
    ranks = {
        base64.b64decode(token): int(rank)
        for token, rank in (line.split() for line in open(vocab_path) if line)
    }
    n_vocab = len(ranks)
    special_tokens = {}

    specials = [
        "<|endoftext|>",
        "<|startoftranscript|>",
        *[f"<|{lang}|>" for lang in list(LANGUAGES.keys())[:num_languages]],
        *[f"<|{audio_event}|>" for audio_event in list(AUDIO_EVENT.keys())],
        *[f"<|{emotion}|>" for emotion in list(EMOTION.keys())],
        "<|translate|>",
        "<|transcribe|>",
        "<|startoflm|>",
        "<|startofprev|>",
        "<|nospeech|>",
        "<|notimestamps|>",
        *[f"<|SPECIAL_TOKEN_{i}|>" for i in range(1, 31)],  # register special tokens for ASR
        *[f"<|{tts}|>" for tts in list(TTS_Vocal_Token.keys())],  # register special tokens for TTS
        *[f"<|{i * 0.02:.2f}|>" for i in range(1501)],
    ]

    for token in specials:
        special_tokens[token] = n_vocab
        n_vocab += 1

    return tiktoken.Encoding(
        name=os.path.basename(vocab_path),
        explicit_n_vocab=n_vocab,
        pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
        mergeable_ranks=ranks,
        special_tokens=special_tokens,
    )


@lru_cache(maxsize=None)
def get_tokenizer(
    multilingual: bool,
    *,
    num_languages: int = 99,
    language: Optional[str] = None,
    task: Optional[str] = None,  # Literal["transcribe", "translate", None]
) -> Tokenizer:
    if language is not None:
        language = language.lower()
        if language not in LANGUAGES:
            if language in TO_LANGUAGE_CODE:
                language = TO_LANGUAGE_CODE[language]
            else:
                raise ValueError(f"Unsupported language: {language}")

    if multilingual:
        encoding_name = "multilingual_zh_ja_yue_char_del"
        language = language or "en"
        task = task or "transcribe"
    else:
        encoding_name = "gpt2"
        language = None
        task = None

    encoding = get_encoding(name=encoding_name, num_languages=num_languages)

    return Tokenizer(
        encoding=encoding, num_languages=num_languages, language=language, task=task
    )
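A quick smoke test of the new tokenizer module; it assumes the whisper and tiktoken packages the repo depends on are installed and the .tiktoken asset above is in place. The <|zh|> marker is one of the language specials registered in get_encoding and is the same tag the README uses in prompts:

```python
from cosyvoice.tokenizer.tokenizer import get_tokenizer

tokenizer = get_tokenizer(multilingual=True)
ids = tokenizer.encoding.encode('<|zh|>你好', allowed_special={'<|zh|>'})
print(ids)
print(tokenizer.encoding.decode(ids))  # '<|zh|>你好'
```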
cosyvoice/utils/common.py
CHANGED
@@ -15,8 +15,10 @@
 # Modified from ESPnet(https://github.com/espnet/espnet)
 """Unility functions for Transformer."""
 
+import random
 from typing import List
 
+import numpy as np
 import torch
 
 IGNORE_ID = -1
@@ -142,3 +144,10 @@ def fade_in_out(fade_in_mel, fade_out_mel, window):
     fade_in_mel[..., :mel_overlap_len] = fade_in_mel[..., :mel_overlap_len] * window[:mel_overlap_len] + \
                                          fade_out_mel[..., -mel_overlap_len:] * window[mel_overlap_len:]
     return fade_in_mel.to(device)
+
+
+def set_all_random_seed(seed):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
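set_all_random_seed is the helper webui.py now imports so that a fixed seed gives reproducible sampling. Minimal usage sketch:

```python
import torch
from cosyvoice.utils.common import set_all_random_seed

set_all_random_seed(1234)
a = torch.rand(3)
set_all_random_seed(1234)
b = torch.rand(3)
print(torch.equal(a, b))  # True: same seed, same draws
```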
webui.py
CHANGED
@@ -24,6 +24,7 @@ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))
 from cosyvoice.cli.cosyvoice import CosyVoice
 from cosyvoice.utils.file_utils import load_wav, logging
+from cosyvoice.utils.common import set_all_random_seed
 
 inference_mode_list = ['预训练音色', '3s极速复刻', '跨语种复刻', '自然语言控制']
 instruct_dict = {'预训练音色': '1. 选择预训练音色\n2. 点击生成音频按钮',
@@ -42,13 +43,6 @@ def generate_seed():
     }
 
 
-def set_all_random_seed(seed):
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    torch.cuda.manual_seed_all(seed)
-
-
 def postprocess(speech, top_db=60, hop_length=220, win_length=440):
     speech, _ = librosa.effects.trim(
         speech, top_db=top_db,