CosyVoice committed on
Commit
95051e5
2 Parent(s): 28f1353 f65eca6

Merge pull request #408 from FunAudioLLM/dev/lyuxiang.lx

Browse files
cosyvoice/cli/model.py CHANGED
@@ -40,6 +40,8 @@ class CosyVoiceModel:
40
  # hift cache
41
  self.mel_cache_len = 20
42
  self.source_cache_len = int(self.mel_cache_len * 256)
 
 
43
  # rtf and decoding related
44
  self.stream_scale_factor = 1
45
  assert self.stream_scale_factor >= 1, 'stream_scale_factor should be greater than 1, change it according to your actual rtf'
@@ -114,13 +116,19 @@ class CosyVoiceModel:
114
  self.mel_overlap_dict[uuid] = tts_mel[:, :, -self.mel_overlap_len:]
115
  tts_mel = tts_mel[:, :, :-self.mel_overlap_len]
116
  tts_speech, tts_source = self.hift.inference(mel=tts_mel, cache_source=hift_cache_source)
117
- self.hift_cache_dict[uuid] = {'source': tts_source[:, :, -self.source_cache_len:], 'mel': tts_mel[:, :, -self.mel_cache_len:]}
 
 
 
 
118
  tts_speech = tts_speech[:, :-self.source_cache_len]
119
  else:
120
  if speed != 1.0:
121
  assert self.hift_cache_dict[uuid] is None, 'speed change only support non-stream inference mode'
122
  tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear')
123
  tts_speech, tts_source = self.hift.inference(mel=tts_mel, cache_source=hift_cache_source)
 
 
124
  return tts_speech
125
 
126
  def inference(self, text, flow_embedding, llm_embedding=torch.zeros(0, 192),
 
40
  # hift cache
41
  self.mel_cache_len = 20
42
  self.source_cache_len = int(self.mel_cache_len * 256)
43
+ # speech fade in out
44
+ self.speech_window = np.hamming(2 * self.source_cache_len)
45
  # rtf and decoding related
46
  self.stream_scale_factor = 1
47
  assert self.stream_scale_factor >= 1, 'stream_scale_factor should be greater than 1, change it according to your actual rtf'
 
116
  self.mel_overlap_dict[uuid] = tts_mel[:, :, -self.mel_overlap_len:]
117
  tts_mel = tts_mel[:, :, :-self.mel_overlap_len]
118
  tts_speech, tts_source = self.hift.inference(mel=tts_mel, cache_source=hift_cache_source)
119
+ if self.hift_cache_dict[uuid] is not None:
120
+ tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
121
+ self.hift_cache_dict[uuid] = {'mel': tts_mel[:, :, -self.mel_cache_len:],
122
+ 'source': tts_source[:, :, -self.source_cache_len:],
123
+ 'speech': tts_speech[:, -self.source_cache_len:]}
124
  tts_speech = tts_speech[:, :-self.source_cache_len]
125
  else:
126
  if speed != 1.0:
127
  assert self.hift_cache_dict[uuid] is None, 'speed change only support non-stream inference mode'
128
  tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear')
129
  tts_speech, tts_source = self.hift.inference(mel=tts_mel, cache_source=hift_cache_source)
130
+ if self.hift_cache_dict[uuid] is not None:
131
+ tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
132
  return tts_speech
133
 
134
  def inference(self, text, flow_embedding, llm_embedding=torch.zeros(0, 192),
cosyvoice/utils/common.py CHANGED
@@ -139,6 +139,6 @@ def fade_in_out(fade_in_mel, fade_out_mel, window):
139
  device = fade_in_mel.device
140
  fade_in_mel, fade_out_mel = fade_in_mel.cpu(), fade_out_mel.cpu()
141
  mel_overlap_len = int(window.shape[0] / 2)
142
- fade_in_mel[:, :, :mel_overlap_len] = fade_in_mel[:, :, :mel_overlap_len] * window[:mel_overlap_len] + \
143
- fade_out_mel[:, :, -mel_overlap_len:] * window[mel_overlap_len:]
144
  return fade_in_mel.to(device)
 
139
  device = fade_in_mel.device
140
  fade_in_mel, fade_out_mel = fade_in_mel.cpu(), fade_out_mel.cpu()
141
  mel_overlap_len = int(window.shape[0] / 2)
142
+ fade_in_mel[..., :mel_overlap_len] = fade_in_mel[..., :mel_overlap_len] * window[:mel_overlap_len] + \
143
+ fade_out_mel[..., -mel_overlap_len:] * window[mel_overlap_len:]
144
  return fade_in_mel.to(device)