CosyVoice committed
Commit 49015f6
1 Parent(s): ed87445

add vc code

cosyvoice/cli/cosyvoice.py CHANGED
@@ -25,6 +25,7 @@ class CosyVoice:
 
     def __init__(self, model_dir, load_jit=True, load_onnx=False):
         instruct = True if '-Instruct' in model_dir else False
+        vc = True if '-VC' in model_dir else False
         self.model_dir = model_dir
         if not os.path.exists(model_dir):
             model_dir = snapshot_download(model_dir)
@@ -36,6 +37,7 @@ class CosyVoice:
                                           '{}/speech_tokenizer_v1.onnx'.format(model_dir),
                                           '{}/spk2info.pt'.format(model_dir),
                                           instruct,
+                                          vc,
                                           configs['allowed_special'])
         self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
         self.model.load('{}/llm.pt'.format(model_dir),
@@ -58,7 +60,7 @@ class CosyVoice:
             model_input = self.frontend.frontend_sft(i, spk_id)
             start_time = time.time()
             logging.info('synthesis text {}'.format(i))
-            for model_output in self.model.inference(**model_input, stream=stream, speed=speed):
+            for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
                 speech_len = model_output['tts_speech'].shape[1] / 22050
                 logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
                 yield model_output
@@ -70,7 +72,7 @@ class CosyVoice:
             model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k)
             start_time = time.time()
             logging.info('synthesis text {}'.format(i))
-            for model_output in self.model.inference(**model_input, stream=stream, speed=speed):
+            for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
                 speech_len = model_output['tts_speech'].shape[1] / 22050
                 logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
                 yield model_output
@@ -83,7 +85,7 @@ class CosyVoice:
             model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k)
             start_time = time.time()
             logging.info('synthesis text {}'.format(i))
-            for model_output in self.model.inference(**model_input, stream=stream, speed=speed):
+            for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
                 speech_len = model_output['tts_speech'].shape[1] / 22050
                 logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
                 yield model_output
@@ -97,8 +99,17 @@ class CosyVoice:
             model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text)
             start_time = time.time()
             logging.info('synthesis text {}'.format(i))
-            for model_output in self.model.inference(**model_input, stream=stream, speed=speed):
+            for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
                 speech_len = model_output['tts_speech'].shape[1] / 22050
                 logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
                 yield model_output
                 start_time = time.time()
+
+    def inference_vc(self, source_speech_16k, prompt_speech_16k, stream=False, speed=1.0):
+        model_input = self.frontend.frontend_vc(source_speech_16k, prompt_speech_16k)
+        start_time = time.time()
+        for model_output in self.model.vc(**model_input, stream=stream, speed=speed):
+            speech_len = model_output['tts_speech'].shape[1] / 22050
+            logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
+            yield model_output
+            start_time = time.time()
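The new inference_vc entry point mirrors the existing inference_* generators: it streams dicts whose 'tts_speech' tensors are 22050 Hz audio. A minimal usage sketch (the file names and model directory are hypothetical; per __init__ above, the '-VC' suffix on the directory name is what enables the vc flag):

import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav

cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-VC')  # hypothetical local dir
source_speech_16k = load_wav('source.wav', 16000)  # both inputs are 16 kHz
prompt_speech_16k = load_wav('prompt.wav', 16000)
for i, out in enumerate(cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k)):
    # output chunks are 22050 Hz, matching the rtf computation above
    torchaudio.save('vc_{}.wav'.format(i), out['tts_speech'], 22050)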
cosyvoice/cli/frontend.py CHANGED
@@ -42,6 +42,7 @@ class CosyVoiceFrontEnd:
                  speech_tokenizer_model: str,
                  spk2info: str = '',
                  instruct: bool = False,
+                 vc: bool = False,
                  allowed_special: str = 'all'):
         self.tokenizer = get_tokenizer()
         self.feat_extractor = feat_extractor
@@ -55,7 +56,10 @@ class CosyVoiceFrontEnd:
                                                            "CPUExecutionProvider"])
         if os.path.exists(spk2info):
             self.spk2info = torch.load(spk2info, map_location=self.device)
+        else:
+            self.spk2info = {}
         self.instruct = instruct
+        self.vc = vc
         self.allowed_special = allowed_special
         self.inflect_parser = inflect.engine()
         self.use_ttsfrd = use_ttsfrd
@@ -172,3 +176,15 @@ class CosyVoiceFrontEnd:
         model_input['prompt_text'] = instruct_text_token
         model_input['prompt_text_len'] = instruct_text_token_len
         return model_input
+
+    def frontend_vc(self, source_speech_16k, prompt_speech_16k):
+        prompt_speech_token, prompt_speech_token_len = self._extract_speech_token(prompt_speech_16k)
+        prompt_speech_22050 = torchaudio.transforms.Resample(orig_freq=16000, new_freq=22050)(prompt_speech_16k)
+        prompt_speech_feat, prompt_speech_feat_len = self._extract_speech_feat(prompt_speech_22050)
+        embedding = self._extract_spk_embedding(prompt_speech_16k)
+        source_speech_token, source_speech_token_len = self._extract_speech_token(source_speech_16k)
+        model_input = {'source_speech_token': source_speech_token, 'source_speech_token_len': source_speech_token_len,
+                       'flow_prompt_speech_token': prompt_speech_token, 'flow_prompt_speech_token_len': prompt_speech_token_len,
+                       'prompt_speech_feat': prompt_speech_feat, 'prompt_speech_feat_len': prompt_speech_feat_len,
+                       'flow_embedding': embedding}
+        return model_input
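Note the sampling-rate split inside frontend_vc: speech tokens and the speaker embedding come from the raw 16 kHz inputs, while the prompt is resampled to 22.05 kHz first because mel features are extracted at the vocoder rate. A small sketch of that resampling step with dummy audio (shape [1, samples] assumed):

import torch
import torchaudio

prompt_speech_16k = torch.randn(1, 16000 * 3)  # 3 s of dummy prompt audio at 16 kHz
resampler = torchaudio.transforms.Resample(orig_freq=16000, new_freq=22050)
prompt_speech_22050 = resampler(prompt_speech_16k)
print(prompt_speech_22050.shape)  # torch.Size([1, 66150])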
cosyvoice/flow/flow.py CHANGED
@@ -124,7 +124,7 @@ class MaskedDiffWithXvec(torch.nn.Module):
         # text encode
         h, h_lengths = self.encoder(token, token_len)
         h = self.encoder_proj(h)
-        mel_len1, mel_len2 = prompt_feat.shape[1], int(token_len2 / 50 * 22050 / 256)
+        mel_len1, mel_len2 = prompt_feat.shape[1], int(token_len2 / self.input_frame_rate * 22050 / 256)
         h, h_lengths = self.length_regulator.inference(h[:, :token_len1], h[:, token_len1:], mel_len1, mel_len2)
 
         # get conditions
@@ -132,7 +132,6 @@ class MaskedDiffWithXvec(torch.nn.Module):
         conds[:, :mel_len1] = prompt_feat
         conds = conds.transpose(1, 2)
 
-        # mask = (~make_pad_mask(feat_len)).to(h)
         mask = (~make_pad_mask(torch.tensor([mel_len1 + mel_len2]))).to(h)
         feat = self.decoder(
             mu=h.transpose(1, 2).contiguous(),
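The mel_len2 expression maps a token count to a mel-frame count: token_len2 / input_frame_rate is the duration in seconds, and 22050 / 256 ≈ 86.13 is the mel frame rate (22050 Hz audio, hop size 256). Replacing the hard-coded 50 with self.input_frame_rate lets the same code serve tokenizers with other frame rates. A worked example with assumed values:

token_len2 = 100       # speech tokens to synthesize (assumed)
input_frame_rate = 50  # tokens per second; previously hard-coded
mel_len2 = int(token_len2 / input_frame_rate * 22050 / 256)
print(mel_len2)        # 172 mel frames for 2.0 s of audio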
cosyvoice/llm/llm.py CHANGED
@@ -206,7 +206,7 @@ class TransformerLM(torch.nn.Module):
             if top_ids == self.speech_token_size:
                 break
             # in stream mode, yield token one by one
-            yield torch.tensor([[top_ids]], dtype=torch.int64, device=device)
+            yield top_ids
             out_tokens.append(top_ids)
             offset += lm_input.size(1)
             lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
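After this change the generator yields plain token ids instead of a [1, 1] int64 tensor per step, leaving batching to the caller. A stand-in sketch of the consumer side (the fake generator below replaces TransformerLM.inference purely for illustration):

import torch

def fake_llm_stream():
    # stand-in for TransformerLM.inference: yields raw ids one by one
    yield from (101, 57, 999)

out_tokens = list(fake_llm_stream())
speech_tokens = torch.tensor(out_tokens, dtype=torch.int64).unsqueeze(dim=0)
print(speech_tokens.shape)  # torch.Size([1, 3]) -- batched once, not per token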
cosyvoice/tokenizer/tokenizer.py CHANGED
@@ -4,6 +4,7 @@ import string
 from dataclasses import dataclass, field
 from functools import cached_property, lru_cache
 from typing import Dict, List, Optional, Tuple
+from whisper.tokenizer import Tokenizer
 
 import tiktoken
 
@@ -165,208 +166,6 @@ TTS_Vocal_Token = {
 }
 
 
-@dataclass
-class Tokenizer:
-    """A thin wrapper around `tiktoken` providing quick access to special tokens"""
-
-    encoding: tiktoken.Encoding
-    num_languages: int
-    language: Optional[str] = None
-    task: Optional[str] = None
-    sot_sequence: Tuple[int] = ()
-    special_tokens: Dict[str, int] = field(default_factory=dict)
-
-    def __post_init__(self):
-        for special in self.encoding.special_tokens_set:
-            special_token = self.encoding.encode_single_token(special)
-            self.special_tokens[special] = special_token
-
-        sot: int = self.special_tokens["<|startoftranscript|>"]
-        translate: int = self.special_tokens["<|translate|>"]
-        transcribe: int = self.special_tokens["<|transcribe|>"]
-
-        langs = tuple(LANGUAGES.keys())[: self.num_languages]
-        sot_sequence = [sot]
-        if self.language is not None:
-            sot_sequence.append(sot + 1 + langs.index(self.language))
-        if self.task is not None:
-            task_token: int = transcribe if self.task == "transcribe" else translate
-            sot_sequence.append(task_token)
-
-        self.sot_sequence = tuple(sot_sequence)
-
-    def encode(self, text, **kwargs):
-        return self.encoding.encode(text, **kwargs)
-
-    def decode(self, token_ids: List[int], **kwargs) -> str:
-        token_ids = [t for t in token_ids if t < self.timestamp_begin]
-        return self.encoding.decode(token_ids, **kwargs)
-
-    def decode_with_timestamps(self, token_ids: List[int], **kwargs) -> str:
-        """
-        Timestamp tokens are above other special tokens' id range and are ignored by `decode()`.
-        This method decodes given tokens with timestamps tokens annotated, e.g. "<|1.08|>".
-        """
-        return self.encoding.decode(token_ids, **kwargs)
-
-    def get_vocab_size(self) -> int:
-        return self.encoding.n_vocab
-
-    @cached_property
-    def eot(self) -> int:
-        return self.encoding.eot_token
-
-    @cached_property
-    def transcribe(self) -> int:
-        return self.special_tokens["<|transcribe|>"]
-
-    @cached_property
-    def translate(self) -> int:
-        return self.special_tokens["<|translate|>"]
-
-    @cached_property
-    def sot(self) -> int:
-        return self.special_tokens["<|startoftranscript|>"]
-
-    @cached_property
-    def sot_lm(self) -> int:
-        return self.special_tokens["<|startoflm|>"]
-
-    @cached_property
-    def sot_prev(self) -> int:
-        return self.special_tokens["<|startofprev|>"]
-
-    @cached_property
-    def no_speech(self) -> int:
-        return self.special_tokens["<|nospeech|>"]
-
-    @cached_property
-    def no_timestamps(self) -> int:
-        return self.special_tokens["<|notimestamps|>"]
-
-    @cached_property
-    def timestamp_begin(self) -> int:
-        return self.special_tokens["<|0.00|>"]
-
-    @cached_property
-    def language_token(self) -> int:
-        """Returns the token id corresponding to the value of the `language` field"""
-        if self.language is None:
-            raise ValueError("This tokenizer does not have language token configured")
-
-        return self.to_language_token(self.language)
-
-    def to_language_token(self, language):
-        if token := self.special_tokens.get(f"<|{language}|>", None):
-            return token
-
-        raise KeyError(f"Language {language} not found in tokenizer.")
-
-    @cached_property
-    def all_language_tokens(self) -> Tuple[int]:
-        result = []
-        for token, token_id in self.special_tokens.items():
-            if token.strip("<|>") in LANGUAGES:
-                result.append(token_id)
-        return tuple(result)[: self.num_languages]
-
-    @cached_property
-    def all_language_codes(self) -> Tuple[str]:
-        return tuple(self.decode([_l]).strip("<|>") for _l in self.all_language_tokens)
-
-    @cached_property
-    def sot_sequence_including_notimestamps(self) -> Tuple[int]:
-        return tuple(list(self.sot_sequence) + [self.no_timestamps])
-
-    @cached_property
-    def non_speech_tokens(self) -> Tuple[int]:
-        """
-        Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
-        annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.
-
-        - ♪♪♪
-        - ( SPEAKING FOREIGN LANGUAGE )
-        - [DAVID] Hey there,
-
-        keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
-        """
-        symbols = list('"#()*+/:;<=>@[\\]^_`{|}~「」『』')
-        symbols += (
-            "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()
-        )
-
-        # symbols that may be a single token or multiple tokens depending on the tokenizer.
-        # In case they're multiple tokens, suppress the first token, which is safe because:
-        # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress
-        # in generations, and in the 3-byte UTF-8 representation they share the first two bytes.
-        miscellaneous = set("♩♪♫♬♭♮♯")
-        assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous)
-
-        # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
-        result = {self.encoding.encode(" -")[0], self.encoding.encode(" '")[0]}
-        for symbol in symbols + list(miscellaneous):
-            for tokens in [
-                self.encoding.encode(symbol),
-                self.encoding.encode(" " + symbol),
-            ]:
-                if len(tokens) == 1 or symbol in miscellaneous:
-                    result.add(tokens[0])
-
-        return tuple(sorted(result))
-
-    def split_to_word_tokens(self, tokens: List[int]):
-        if self.language in {"zh", "ja", "th", "lo", "my", "yue"}:
-            # These languages don't typically use spaces, so it is difficult to split words
-            # without morpheme analysis. Here, we instead split words at any
-            # position where the tokens are decoded as valid unicode points
-            return self.split_tokens_on_unicode(tokens)
-
-        return self.split_tokens_on_spaces(tokens)
-
-    def split_tokens_on_unicode(self, tokens: List[int]):
-        decoded_full = self.decode_with_timestamps(tokens)
-        replacement_char = "\ufffd"
-
-        words = []
-        word_tokens = []
-        current_tokens = []
-        unicode_offset = 0
-
-        for token in tokens:
-            current_tokens.append(token)
-            decoded = self.decode_with_timestamps(current_tokens)
-
-            if (
-                replacement_char not in decoded
-                or decoded_full[unicode_offset + decoded.index(replacement_char)]
-                == replacement_char
-            ):
-                words.append(decoded)
-                word_tokens.append(current_tokens)
-                current_tokens = []
-                unicode_offset += len(decoded)
-
-        return words, word_tokens
-
-    def split_tokens_on_spaces(self, tokens: List[int]):
-        subwords, subword_tokens_list = self.split_tokens_on_unicode(tokens)
-        words = []
-        word_tokens = []
-
-        for subword, subword_tokens in zip(subwords, subword_tokens_list):
-            special = subword_tokens[0] >= self.eot
-            with_space = subword.startswith(" ")
-            punctuation = subword.strip() in string.punctuation
-            if special or with_space or punctuation or len(words) == 0:
-                words.append(subword)
-                word_tokens.append(subword_tokens)
-            else:
-                words[-1] = words[-1] + subword
-                word_tokens[-1].extend(subword_tokens)
-
-        return words, word_tokens
-
-
 @lru_cache(maxsize=None)
 def get_encoding(name: str = "gpt2", num_languages: int = 99):
     vocab_path = os.path.join(os.path.dirname(__file__), "assets", f"{name}.tiktoken")
cosyvoice/utils/common.py CHANGED
@@ -15,8 +15,10 @@
 # Modified from ESPnet(https://github.com/espnet/espnet)
 """Unility functions for Transformer."""
 
+import random
 from typing import List
 
+import numpy as np
 import torch
 
 IGNORE_ID = -1
@@ -142,3 +144,9 @@ def fade_in_out(fade_in_mel, fade_out_mel, window):
     fade_in_mel[..., :mel_overlap_len] = fade_in_mel[..., :mel_overlap_len] * window[:mel_overlap_len] + \
                                          fade_out_mel[..., -mel_overlap_len:] * window[mel_overlap_len:]
     return fade_in_mel.to(device)
+
+def set_all_random_seed(seed):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
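set_all_random_seed is hoisted out of webui.py (see the webui.py hunk below) so any entry point can seed Python, NumPy, and Torch in one call. A quick reproducibility check:

import torch
from cosyvoice.utils.common import set_all_random_seed

set_all_random_seed(1986)  # seeds random, numpy, and torch (CPU + CUDA)
a = torch.randn(3)
set_all_random_seed(1986)
b = torch.randn(3)
assert torch.equal(a, b)   # identical draws after reseeding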
webui.py CHANGED
@@ -24,6 +24,7 @@ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))
 from cosyvoice.cli.cosyvoice import CosyVoice
 from cosyvoice.utils.file_utils import load_wav, logging
+from cosyvoice.utils.common import set_all_random_seed
 
 inference_mode_list = ['预训练音色', '3s极速复刻', '跨语种复刻', '自然语言控制']
 instruct_dict = {'预训练音色': '1. 选择预训练音色\n2. 点击生成音频按钮',
@@ -42,13 +43,6 @@ def generate_seed():
     }
 
 
-def set_all_random_seed(seed):
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    torch.cuda.manual_seed_all(seed)
-
-
 def postprocess(speech, top_db=60, hop_length=220, win_length=440):
     speech, _ = librosa.effects.trim(
         speech, top_db=top_db,