CosyVoice committed
Commit 6920200
2 Parent(s): f6d44af ba3d969

Merge pull request #436 from FunAudioLLM/dev/lyuxiang.lx
README.md CHANGED
@@ -22,12 +22,9 @@ For `SenseVoice`, visit [SenseVoice repo](https://github.com/FunAudioLLM/SenseVo
 - [ ] 25hz cosyvoice base model
 - [ ] 25hz cosyvoice voice conversion model
 
-- [ ] 2024/10
-
-- [ ] 50hz llama based llm model which supports lora finetune
-
 - [ ] TBD
 
+- [ ] 25hz llama based llm model which supports lora finetune
 - [ ] Support more instruction mode
 - [ ] Voice conversion
 - [ ] Music generation
@@ -74,6 +71,7 @@ If you are expert in this field, and you are only interested in training your ow
 # SDK模型下载
 from modelscope import snapshot_download
 snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M')
+snapshot_download('iic/CosyVoice-300M-25Hz', local_dir='pretrained_models/CosyVoice-300M-25Hz')
 snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT')
 snapshot_download('iic/CosyVoice-300M-Instruct', local_dir='pretrained_models/CosyVoice-300M-Instruct')
 snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd')
@@ -83,6 +81,7 @@ snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice
 # git模型下载,请确保已安装git lfs
 mkdir -p pretrained_models
 git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M
+git clone https://www.modelscope.cn/iic/CosyVoice-300M-25Hz.git pretrained_models/CosyVoice-300M-25Hz
 git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT
 git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_models/CosyVoice-300M-Instruct
 git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd
@@ -121,7 +120,7 @@ print(cosyvoice.list_avaliable_spks())
 for i, j in enumerate(cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女', stream=False)):
     torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], 22050)
 
-cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M')
+cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-25Hz') # or change to pretrained_models/CosyVoice-300M for 50Hz inference
 # zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
 prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
 for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
@@ -130,6 +129,11 @@ for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来
 prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
 for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k, stream=False)):
     torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], 22050)
+# vc usage
+prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
+source_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
+for i, j in enumerate(cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=False)):
+    torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], 22050)
 
 cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
 # instruct usage, support <laughter></laughter><strong></strong>[laughter][breath]
cosyvoice/cli/cosyvoice.py CHANGED
@@ -58,7 +58,7 @@ class CosyVoice:
             model_input = self.frontend.frontend_sft(i, spk_id)
             start_time = time.time()
             logging.info('synthesis text {}'.format(i))
-            for model_output in self.model.inference(**model_input, stream=stream, speed=speed):
+            for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
                 speech_len = model_output['tts_speech'].shape[1] / 22050
                 logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
                 yield model_output
@@ -70,7 +70,7 @@ class CosyVoice:
             model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k)
             start_time = time.time()
             logging.info('synthesis text {}'.format(i))
-            for model_output in self.model.inference(**model_input, stream=stream, speed=speed):
+            for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
                 speech_len = model_output['tts_speech'].shape[1] / 22050
                 logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
                 yield model_output
@@ -83,7 +83,7 @@ class CosyVoice:
             model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k)
             start_time = time.time()
             logging.info('synthesis text {}'.format(i))
-            for model_output in self.model.inference(**model_input, stream=stream, speed=speed):
+            for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
                 speech_len = model_output['tts_speech'].shape[1] / 22050
                 logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
                 yield model_output
@@ -97,8 +97,17 @@ class CosyVoice:
             model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text)
             start_time = time.time()
             logging.info('synthesis text {}'.format(i))
-            for model_output in self.model.inference(**model_input, stream=stream, speed=speed):
+            for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
                 speech_len = model_output['tts_speech'].shape[1] / 22050
                 logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
                 yield model_output
             start_time = time.time()
+
+    def inference_vc(self, source_speech_16k, prompt_speech_16k, stream=False, speed=1.0):
+        model_input = self.frontend.frontend_vc(source_speech_16k, prompt_speech_16k)
+        start_time = time.time()
+        for model_output in self.model.vc(**model_input, stream=stream, speed=speed):
+            speech_len = model_output['tts_speech'].shape[1] / 22050
+            logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
+            yield model_output
+        start_time = time.time()
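Since every inference_* method above is a generator, a caller can either save each yielded chunk separately (as the README does) or stitch the chunks into one waveform. A minimal consumption sketch, assuming the environment setup and the pretrained CosyVoice-300M-SFT directory from the README are in place:

```python
import torch
import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice  # assumes the README's sys.path setup for third_party/Matcha-TTS

cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT')  # assumed local model dir
chunks = []
# with stream=True, partial waveforms are yielded as soon as enough speech tokens are decoded
for chunk in cosyvoice.inference_sft('你好,我是通义生成式语音大模型。', '中文女', stream=True):
    chunks.append(chunk['tts_speech'])  # (1, T) float tensor at 22050 Hz
torchaudio.save('sft_stream.wav', torch.concat(chunks, dim=1), 22050)
```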
cosyvoice/cli/frontend.py CHANGED
@@ -55,6 +55,8 @@ class CosyVoiceFrontEnd:
                                                                                "CPUExecutionProvider"])
         if os.path.exists(spk2info):
             self.spk2info = torch.load(spk2info, map_location=self.device)
+        else:
+            self.spk2info = {}
         self.instruct = instruct
         self.allowed_special = allowed_special
         self.inflect_parser = inflect.engine()
@@ -172,3 +174,15 @@ class CosyVoiceFrontEnd:
         model_input['prompt_text'] = instruct_text_token
         model_input['prompt_text_len'] = instruct_text_token_len
         return model_input
+
+    def frontend_vc(self, source_speech_16k, prompt_speech_16k):
+        prompt_speech_token, prompt_speech_token_len = self._extract_speech_token(prompt_speech_16k)
+        prompt_speech_22050 = torchaudio.transforms.Resample(orig_freq=16000, new_freq=22050)(prompt_speech_16k)
+        prompt_speech_feat, prompt_speech_feat_len = self._extract_speech_feat(prompt_speech_22050)
+        embedding = self._extract_spk_embedding(prompt_speech_16k)
+        source_speech_token, source_speech_token_len = self._extract_speech_token(source_speech_16k)
+        model_input = {'source_speech_token': source_speech_token, 'source_speech_token_len': source_speech_token_len,
+                       'flow_prompt_speech_token': prompt_speech_token, 'flow_prompt_speech_token_len': prompt_speech_token_len,
+                       'prompt_speech_feat': prompt_speech_feat, 'prompt_speech_feat_len': prompt_speech_feat_len,
+                       'flow_embedding': embedding}
+        return model_input
cosyvoice/cli/model.py CHANGED
@@ -35,7 +35,7 @@ class CosyVoiceModel:
         self.token_max_hop_len = 200
         self.token_overlap_len = 20
         # mel fade in out
-        self.mel_overlap_len = 34
+        self.mel_overlap_len = int(self.token_overlap_len / self.flow.input_frame_rate * 22050 / 256)
         self.mel_window = np.hamming(2 * self.mel_overlap_len)
         # hift cache
         self.mel_cache_len = 20
@@ -63,11 +63,11 @@ class CosyVoiceModel:
         self.hift.to(self.device).eval()
 
     def load_jit(self, llm_text_encoder_model, llm_llm_model, flow_encoder_model):
-        llm_text_encoder = torch.jit.load(llm_text_encoder_model)
+        llm_text_encoder = torch.jit.load(llm_text_encoder_model, map_location=self.device)
         self.llm.text_encoder = llm_text_encoder
-        llm_llm = torch.jit.load(llm_llm_model)
+        llm_llm = torch.jit.load(llm_llm_model, map_location=self.device)
         self.llm.llm = llm_llm
-        flow_encoder = torch.jit.load(flow_encoder_model)
+        flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
         self.flow.encoder = flow_encoder
 
     def load_onnx(self, flow_decoder_estimator_model):
@@ -131,11 +131,11 @@ class CosyVoiceModel:
             tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
         return tts_speech
 
-    def inference(self, text, flow_embedding, llm_embedding=torch.zeros(0, 192),
-                  prompt_text=torch.zeros(1, 0, dtype=torch.int32),
-                  llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
-                  flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
-                  prompt_speech_feat=torch.zeros(1, 0, 80), stream=False, speed=1.0, **kwargs):
+    def tts(self, text, flow_embedding, llm_embedding=torch.zeros(0, 192),
+            prompt_text=torch.zeros(1, 0, dtype=torch.int32),
+            llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
+            flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
+            prompt_speech_feat=torch.zeros(1, 0, 80), stream=False, speed=1.0, **kwargs):
         # this_uuid is used to track variables related to this inference thread
         this_uuid = str(uuid.uuid1())
         with self.lock:
@@ -148,7 +148,8 @@ class CosyVoiceModel:
             while True:
                 time.sleep(0.1)
                 if len(self.tts_speech_token_dict[this_uuid]) >= token_hop_len + self.token_overlap_len:
-                    this_tts_speech_token = torch.concat(self.tts_speech_token_dict[this_uuid][:token_hop_len + self.token_overlap_len], dim=1)
+                    this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_hop_len + self.token_overlap_len]) \
+                        .unsqueeze(dim=0)
                     this_tts_speech = self.token2wav(token=this_tts_speech_token,
                                                      prompt_token=flow_prompt_speech_token,
                                                      prompt_feat=prompt_speech_feat,
@@ -164,7 +165,7 @@ class CosyVoiceModel:
                     break
             p.join()
             # deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None
-            this_tts_speech_token = torch.concat(self.tts_speech_token_dict[this_uuid], dim=1)
+            this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
             this_tts_speech = self.token2wav(token=this_tts_speech_token,
                                              prompt_token=flow_prompt_speech_token,
                                              prompt_feat=prompt_speech_feat,
@@ -175,7 +176,58 @@ class CosyVoiceModel:
         else:
             # deal with all tokens
             p.join()
-            this_tts_speech_token = torch.concat(self.tts_speech_token_dict[this_uuid], dim=1)
+            this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
+            this_tts_speech = self.token2wav(token=this_tts_speech_token,
+                                             prompt_token=flow_prompt_speech_token,
+                                             prompt_feat=prompt_speech_feat,
+                                             embedding=flow_embedding,
+                                             uuid=this_uuid,
+                                             finalize=True,
+                                             speed=speed)
+            yield {'tts_speech': this_tts_speech.cpu()}
+        with self.lock:
+            self.tts_speech_token_dict.pop(this_uuid)
+            self.llm_end_dict.pop(this_uuid)
+            self.mel_overlap_dict.pop(this_uuid)
+            self.hift_cache_dict.pop(this_uuid)
+
+    def vc(self, source_speech_token, flow_prompt_speech_token, prompt_speech_feat, flow_embedding, stream=False, speed=1.0, **kwargs):
+        # this_uuid is used to track variables related to this inference thread
+        this_uuid = str(uuid.uuid1())
+        with self.lock:
+            self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = source_speech_token.flatten().tolist(), True
+            self.mel_overlap_dict[this_uuid], self.hift_cache_dict[this_uuid] = None, None
+        if stream is True:
+            token_hop_len = self.token_min_hop_len
+            while True:
+                if len(self.tts_speech_token_dict[this_uuid]) >= token_hop_len + self.token_overlap_len:
+                    this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_hop_len + self.token_overlap_len]) \
+                        .unsqueeze(dim=0)
+                    this_tts_speech = self.token2wav(token=this_tts_speech_token,
+                                                     prompt_token=flow_prompt_speech_token,
+                                                     prompt_feat=prompt_speech_feat,
+                                                     embedding=flow_embedding,
+                                                     uuid=this_uuid,
+                                                     finalize=False)
+                    yield {'tts_speech': this_tts_speech.cpu()}
+                    with self.lock:
+                        self.tts_speech_token_dict[this_uuid] = self.tts_speech_token_dict[this_uuid][token_hop_len:]
+                    # increase token_hop_len for better speech quality
+                    token_hop_len = min(self.token_max_hop_len, int(token_hop_len * self.stream_scale_factor))
+                if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) < token_hop_len + self.token_overlap_len:
+                    break
+            # deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None
+            this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
+            this_tts_speech = self.token2wav(token=this_tts_speech_token,
+                                             prompt_token=flow_prompt_speech_token,
+                                             prompt_feat=prompt_speech_feat,
+                                             embedding=flow_embedding,
+                                             uuid=this_uuid,
+                                             finalize=True)
+            yield {'tts_speech': this_tts_speech.cpu()}
+        else:
+            # deal with all tokens
+            this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
             this_tts_speech = self.token2wav(token=this_tts_speech_token,
                                              prompt_token=flow_prompt_speech_token,
                                              prompt_feat=prompt_speech_feat,
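The new mel_overlap_len expression ties the mel-domain crossfade length to the token frame rate (token_overlap_len tokens, 256-sample mel hop at 22050 Hz). A quick sanity check of the values it produces, using only the constants visible in this diff:

```python
# token_overlap_len = 20, mel hop size 256 at 22050 Hz
int(20 / 25 * 22050 / 256)  # 68 mel frames at the 25Hz token rate
int(20 / 50 * 22050 / 256)  # 34 mel frames at the 50Hz token rate, i.e. the previously hard-coded value
```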
cosyvoice/flow/flow.py CHANGED
@@ -124,15 +124,14 @@ class MaskedDiffWithXvec(torch.nn.Module):
         # text encode
         h, h_lengths = self.encoder(token, token_len)
         h = self.encoder_proj(h)
-        mel_len1, mel_len2 = prompt_feat.shape[1], int(token_len2 / 50 * 22050 / 256)
-        h, h_lengths = self.length_regulator.inference(h[:, :token_len1], h[:, token_len1:], mel_len1, mel_len2)
+        mel_len1, mel_len2 = prompt_feat.shape[1], int(token_len2 / self.input_frame_rate * 22050 / 256)
+        h, h_lengths = self.length_regulator.inference(h[:, :token_len1], h[:, token_len1:], mel_len1, mel_len2, self.input_frame_rate)
 
         # get conditions
         conds = torch.zeros([1, mel_len1 + mel_len2, self.output_size], device=token.device)
         conds[:, :mel_len1] = prompt_feat
         conds = conds.transpose(1, 2)
 
-        # mask = (~make_pad_mask(feat_len)).to(h)
         mask = (~make_pad_mask(torch.tensor([mel_len1 + mel_len2]))).to(h)
         feat = self.decoder(
             mu=h.transpose(1, 2).contiguous(),
cosyvoice/flow/length_regulator.py CHANGED
@@ -49,13 +49,14 @@ class InterpolateRegulator(nn.Module):
         olens = ylens
         return out * mask, olens
 
-    def inference(self, x1, x2, mel_len1, mel_len2):
+    def inference(self, x1, x2, mel_len1, mel_len2, input_frame_rate=50):
         # in inference mode, interploate prompt token and token(head/mid/tail) seprately, so we can get a clear separation point of mel
         # x in (B, T, D)
         if x2.shape[1] > 40:
-            x2_head = F.interpolate(x2[:, :20].transpose(1, 2).contiguous(), size=34, mode='linear')
-            x2_mid = F.interpolate(x2[:, 20:-20].transpose(1, 2).contiguous(), size=mel_len2 - 34 * 2, mode='linear')
-            x2_tail = F.interpolate(x2[:, -20:].transpose(1, 2).contiguous(), size=34, mode='linear')
+            x2_head = F.interpolate(x2[:, :20].transpose(1, 2).contiguous(), size=int(20 / input_frame_rate * 22050 / 256), mode='linear')
+            x2_mid = F.interpolate(x2[:, 20:-20].transpose(1, 2).contiguous(), size=mel_len2 - int(20 / input_frame_rate * 22050 / 256) * 2,
+                                   mode='linear')
+            x2_tail = F.interpolate(x2[:, -20:].transpose(1, 2).contiguous(), size=int(20 / input_frame_rate * 22050 / 256), mode='linear')
             x2 = torch.concat([x2_head, x2_mid, x2_tail], dim=2)
         else:
             x2 = F.interpolate(x2.transpose(1, 2).contiguous(), size=mel_len2, mode='linear')
cosyvoice/llm/llm.py CHANGED
@@ -206,7 +206,7 @@ class TransformerLM(torch.nn.Module):
             if top_ids == self.speech_token_size:
                 break
             # in stream mode, yield token one by one
-            yield torch.tensor([[top_ids]], dtype=torch.int64, device=device)
+            yield top_ids
             out_tokens.append(top_ids)
             offset += lm_input.size(1)
             lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken ADDED
The diff for this file is too large to render. See raw diff
 
cosyvoice/tokenizer/tokenizer.py ADDED
@@ -0,0 +1,236 @@
+import base64
+import os
+from functools import lru_cache
+from typing import Optional
+from whisper.tokenizer import Tokenizer
+
+import tiktoken
+
+LANGUAGES = {
+    "en": "english",
+    "zh": "chinese",
+    "de": "german",
+    "es": "spanish",
+    "ru": "russian",
+    "ko": "korean",
+    "fr": "french",
+    "ja": "japanese",
+    "pt": "portuguese",
+    "tr": "turkish",
+    "pl": "polish",
+    "ca": "catalan",
+    "nl": "dutch",
+    "ar": "arabic",
+    "sv": "swedish",
+    "it": "italian",
+    "id": "indonesian",
+    "hi": "hindi",
+    "fi": "finnish",
+    "vi": "vietnamese",
+    "he": "hebrew",
+    "uk": "ukrainian",
+    "el": "greek",
+    "ms": "malay",
+    "cs": "czech",
+    "ro": "romanian",
+    "da": "danish",
+    "hu": "hungarian",
+    "ta": "tamil",
+    "no": "norwegian",
+    "th": "thai",
+    "ur": "urdu",
+    "hr": "croatian",
+    "bg": "bulgarian",
+    "lt": "lithuanian",
+    "la": "latin",
+    "mi": "maori",
+    "ml": "malayalam",
+    "cy": "welsh",
+    "sk": "slovak",
+    "te": "telugu",
+    "fa": "persian",
+    "lv": "latvian",
+    "bn": "bengali",
+    "sr": "serbian",
+    "az": "azerbaijani",
+    "sl": "slovenian",
+    "kn": "kannada",
+    "et": "estonian",
+    "mk": "macedonian",
+    "br": "breton",
+    "eu": "basque",
+    "is": "icelandic",
+    "hy": "armenian",
+    "ne": "nepali",
+    "mn": "mongolian",
+    "bs": "bosnian",
+    "kk": "kazakh",
+    "sq": "albanian",
+    "sw": "swahili",
+    "gl": "galician",
+    "mr": "marathi",
+    "pa": "punjabi",
+    "si": "sinhala",
+    "km": "khmer",
+    "sn": "shona",
+    "yo": "yoruba",
+    "so": "somali",
+    "af": "afrikaans",
+    "oc": "occitan",
+    "ka": "georgian",
+    "be": "belarusian",
+    "tg": "tajik",
+    "sd": "sindhi",
+    "gu": "gujarati",
+    "am": "amharic",
+    "yi": "yiddish",
+    "lo": "lao",
+    "uz": "uzbek",
+    "fo": "faroese",
+    "ht": "haitian creole",
+    "ps": "pashto",
+    "tk": "turkmen",
+    "nn": "nynorsk",
+    "mt": "maltese",
+    "sa": "sanskrit",
+    "lb": "luxembourgish",
+    "my": "myanmar",
+    "bo": "tibetan",
+    "tl": "tagalog",
+    "mg": "malagasy",
+    "as": "assamese",
+    "tt": "tatar",
+    "haw": "hawaiian",
+    "ln": "lingala",
+    "ha": "hausa",
+    "ba": "bashkir",
+    "jw": "javanese",
+    "su": "sundanese",
+    "yue": "cantonese",
+    "minnan": "minnan",
+    "wuyu": "wuyu",
+    "dialect": "dialect",
+    "zh/en": "zh/en",
+    "en/zh": "en/zh",
+}
+
+# language code lookup by name, with a few language aliases
+TO_LANGUAGE_CODE = {
+    **{language: code for code, language in LANGUAGES.items()},
+    "burmese": "my",
+    "valencian": "ca",
+    "flemish": "nl",
+    "haitian": "ht",
+    "letzeburgesch": "lb",
+    "pushto": "ps",
+    "panjabi": "pa",
+    "moldavian": "ro",
+    "moldovan": "ro",
+    "sinhalese": "si",
+    "castilian": "es",
+    "mandarin": "zh",
+}
+
+AUDIO_EVENT = {
+    "ASR": "ASR",
+    "AED": "AED",
+    "SER": "SER",
+    "Speech": "Speech",
+    "/Speech": "/Speech",
+    "BGM": "BGM",
+    "/BGM": "/BGM",
+    "Laughter": "Laughter",
+    "/Laughter": "/Laughter",
+    "Applause": "Applause",
+    "/Applause": "/Applause",
+}
+
+EMOTION = {
+    "HAPPY": "HAPPY",
+    "SAD": "SAD",
+    "ANGRY": "ANGRY",
+    "NEUTRAL": "NEUTRAL",
+}
+
+TTS_Vocal_Token = {
+    "TTS/B": "TTS/B",
+    "TTS/O": "TTS/O",
+    "TTS/Q": "TTS/Q",
+    "TTS/A": "TTS/A",
+    "TTS/CO": "TTS/CO",
+    "TTS/CL": "TTS/CL",
+    "TTS/H": "TTS/H",
+    **{f"TTS/SP{i:02d}": f"TTS/SP{i:02d}" for i in range(1, 14)}
+}
+
+
+@lru_cache(maxsize=None)
+def get_encoding(name: str = "gpt2", num_languages: int = 99):
+    vocab_path = os.path.join(os.path.dirname(__file__), "assets", f"{name}.tiktoken")
+    ranks = {
+        base64.b64decode(token): int(rank)
+        for token, rank in (line.split() for line in open(vocab_path) if line)
+    }
+    n_vocab = len(ranks)
+    special_tokens = {}
+
+    specials = [
+        "<|endoftext|>",
+        "<|startoftranscript|>",
+        *[f"<|{lang}|>" for lang in list(LANGUAGES.keys())[:num_languages]],
+        *[f"<|{audio_event}|>" for audio_event in list(AUDIO_EVENT.keys())],
+        *[f"<|{emotion}|>" for emotion in list(EMOTION.keys())],
+        "<|translate|>",
+        "<|transcribe|>",
+        "<|startoflm|>",
+        "<|startofprev|>",
+        "<|nospeech|>",
+        "<|notimestamps|>",
+        *[f"<|SPECIAL_TOKEN_{i}|>" for i in range(1, 31)],  # register special tokens for ASR
+        *[f"<|{tts}|>" for tts in list(TTS_Vocal_Token.keys())],  # register special tokens for TTS
+        *[f"<|{i * 0.02:.2f}|>" for i in range(1501)],
+    ]
+
+    for token in specials:
+        special_tokens[token] = n_vocab
+        n_vocab += 1
+
+    return tiktoken.Encoding(
+        name=os.path.basename(vocab_path),
+        explicit_n_vocab=n_vocab,
+        pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+        mergeable_ranks=ranks,
+        special_tokens=special_tokens,
+    )
+
+
+@lru_cache(maxsize=None)
+def get_tokenizer(
+    multilingual: bool,
+    *,
+    num_languages: int = 99,
+    language: Optional[str] = None,
+    task: Optional[str] = None,  # Literal["transcribe", "translate", None]
+) -> Tokenizer:
+    if language is not None:
+        language = language.lower()
+        if language not in LANGUAGES:
+            if language in TO_LANGUAGE_CODE:
+                language = TO_LANGUAGE_CODE[language]
+            else:
+                raise ValueError(f"Unsupported language: {language}")
+
+    if multilingual:
+        encoding_name = "multilingual_zh_ja_yue_char_del"
+        language = language or "en"
+        task = task or "transcribe"
+    else:
+        encoding_name = "gpt2"
+        language = None
+        task = None
+
+    encoding = get_encoding(name=encoding_name, num_languages=num_languages)
+
+    return Tokenizer(
+        encoding=encoding, num_languages=num_languages, language=language, task=task
+    )
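A minimal usage sketch for the new module, assuming the whisper package's Tokenizer wrapper exposes encode/decode as in upstream openai-whisper and that the bundled multilingual_zh_ja_yue_char_del.tiktoken asset sits in the assets directory added above:

```python
from cosyvoice.tokenizer.tokenizer import get_tokenizer

# multilingual=True selects the bundled multilingual_zh_ja_yue_char_del vocabulary
tokenizer = get_tokenizer(multilingual=True, language="zh", task="transcribe")
ids = tokenizer.encode("你好,世界")
print(len(ids), tokenizer.decode(ids))
```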
cosyvoice/utils/common.py CHANGED
@@ -15,8 +15,10 @@
 # Modified from ESPnet(https://github.com/espnet/espnet)
 """Unility functions for Transformer."""
 
+import random
 from typing import List
 
+import numpy as np
 import torch
 
 IGNORE_ID = -1
@@ -142,3 +144,10 @@ def fade_in_out(fade_in_mel, fade_out_mel, window):
     fade_in_mel[..., :mel_overlap_len] = fade_in_mel[..., :mel_overlap_len] * window[:mel_overlap_len] + \
         fade_out_mel[..., -mel_overlap_len:] * window[mel_overlap_len:]
     return fade_in_mel.to(device)
+
+
+def set_all_random_seed(seed):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
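set_all_random_seed now lives here so that webui.py (below) can import it instead of defining its own copy. A usage sketch, with an arbitrary seed value:

```python
from cosyvoice.utils.common import set_all_random_seed

set_all_random_seed(1986)  # seeds python, numpy and torch (CPU and CUDA) RNGs before sampling
```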
webui.py CHANGED
@@ -24,6 +24,7 @@ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))
 from cosyvoice.cli.cosyvoice import CosyVoice
 from cosyvoice.utils.file_utils import load_wav, logging
+from cosyvoice.utils.common import set_all_random_seed
 
 inference_mode_list = ['预训练音色', '3s极速复刻', '跨语种复刻', '自然语言控制']
 instruct_dict = {'预训练音色': '1. 选择预训练音色\n2. 点击生成音频按钮',
@@ -42,13 +43,6 @@ def generate_seed():
     }
 
 
-def set_all_random_seed(seed):
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    torch.cuda.manual_seed_all(seed)
-
-
 def postprocess(speech, top_db=60, hop_length=220, win_length=440):
     speech, _ = librosa.effects.trim(
         speech, top_db=top_db,