Spaces: Running on Zero
Merge pull request #408 from FunAudioLLM/dev/lyuxiang.lx
Browse files
- cosyvoice/cli/model.py +9 -1
- cosyvoice/utils/common.py +2 -2
cosyvoice/cli/model.py
CHANGED
@@ -40,6 +40,8 @@ class CosyVoiceModel:
|
|
40 |
# hift cache
|
41 |
self.mel_cache_len = 20
|
42 |
self.source_cache_len = int(self.mel_cache_len * 256)
|
|
|
|
|
43 |
# rtf and decoding related
|
44 |
self.stream_scale_factor = 1
|
45 |
assert self.stream_scale_factor >= 1, 'stream_scale_factor should be greater than 1, change it according to your actual rtf'
|
@@ -114,13 +116,19 @@ class CosyVoiceModel:
|
|
114 |
self.mel_overlap_dict[uuid] = tts_mel[:, :, -self.mel_overlap_len:]
|
115 |
tts_mel = tts_mel[:, :, :-self.mel_overlap_len]
|
116 |
tts_speech, tts_source = self.hift.inference(mel=tts_mel, cache_source=hift_cache_source)
|
117 |
-
self.hift_cache_dict[uuid]
|
|
|
|
|
|
|
|
|
118 |
tts_speech = tts_speech[:, :-self.source_cache_len]
|
119 |
else:
|
120 |
if speed != 1.0:
|
121 |
assert self.hift_cache_dict[uuid] is None, 'speed change only support non-stream inference mode'
|
122 |
tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear')
|
123 |
tts_speech, tts_source = self.hift.inference(mel=tts_mel, cache_source=hift_cache_source)
|
|
|
|
|
124 |
return tts_speech
|
125 |
|
126 |
def inference(self, text, flow_embedding, llm_embedding=torch.zeros(0, 192),
|
|
|
40 |
# hift cache
|
41 |
self.mel_cache_len = 20
|
42 |
self.source_cache_len = int(self.mel_cache_len * 256)
|
43 |
+
# speech fade in out
|
44 |
+
self.speech_window = np.hamming(2 * self.source_cache_len)
|
45 |
# rtf and decoding related
|
46 |
self.stream_scale_factor = 1
|
47 |
assert self.stream_scale_factor >= 1, 'stream_scale_factor should be greater than 1, change it according to your actual rtf'
|
|
|
116 |
self.mel_overlap_dict[uuid] = tts_mel[:, :, -self.mel_overlap_len:]
|
117 |
tts_mel = tts_mel[:, :, :-self.mel_overlap_len]
|
118 |
tts_speech, tts_source = self.hift.inference(mel=tts_mel, cache_source=hift_cache_source)
|
119 |
+
if self.hift_cache_dict[uuid] is not None:
|
120 |
+
tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
|
121 |
+
self.hift_cache_dict[uuid] = {'mel': tts_mel[:, :, -self.mel_cache_len:],
|
122 |
+
'source': tts_source[:, :, -self.source_cache_len:],
|
123 |
+
'speech': tts_speech[:, -self.source_cache_len:]}
|
124 |
tts_speech = tts_speech[:, :-self.source_cache_len]
|
125 |
else:
|
126 |
if speed != 1.0:
|
127 |
assert self.hift_cache_dict[uuid] is None, 'speed change only support non-stream inference mode'
|
128 |
tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear')
|
129 |
tts_speech, tts_source = self.hift.inference(mel=tts_mel, cache_source=hift_cache_source)
|
130 |
+
if self.hift_cache_dict[uuid] is not None:
|
131 |
+
tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
|
132 |
return tts_speech
|
133 |
|
134 |
def inference(self, text, flow_embedding, llm_embedding=torch.zeros(0, 192),
|
cosyvoice/utils/common.py
CHANGED
@@ -139,6 +139,6 @@ def fade_in_out(fade_in_mel, fade_out_mel, window):
|
|
139 |
device = fade_in_mel.device
|
140 |
fade_in_mel, fade_out_mel = fade_in_mel.cpu(), fade_out_mel.cpu()
|
141 |
mel_overlap_len = int(window.shape[0] / 2)
|
142 |
-
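fade_in_mel[:, :, :mel_overlap_len] = fade_in_mel[:, :, :mel_overlap_len] * window[:mel_overlap_len] + \
|
143 |
-
fade_out_mel[:, :, -mel_overlap_len:] * window[mel_overlap_len:]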
|
144 |
return fade_in_mel.to(device)
|
|
|
139 |
device = fade_in_mel.device
|
140 |
fade_in_mel, fade_out_mel = fade_in_mel.cpu(), fade_out_mel.cpu()
|
141 |
mel_overlap_len = int(window.shape[0] / 2)
|
142 |
+
fade_in_mel[..., :mel_overlap_len] = fade_in_mel[..., :mel_overlap_len] * window[:mel_overlap_len] + \
|
143 |
+
fade_out_mel[..., -mel_overlap_len:] * window[mel_overlap_len:]
|
144 |
return fade_in_mel.to(device)
|