CosyVoice committed on
Commit 9ab298d
1 Parent(s): f4e70e2

add llm export script
cosyvoice/cli/cosyvoice.py CHANGED
@@ -21,7 +21,7 @@ from cosyvoice.utils.file_utils import logging
 
 class CosyVoice:
 
-    def __init__(self, model_dir):
+    def __init__(self, model_dir, load_script=True):
         instruct = True if '-Instruct' in model_dir else False
         self.model_dir = model_dir
         if not os.path.exists(model_dir):
@@ -39,6 +39,9 @@ class CosyVoice:
         self.model.load('{}/llm.pt'.format(model_dir),
                         '{}/flow.pt'.format(model_dir),
                         '{}/hift.pt'.format(model_dir))
+        if load_script:
+            self.model.load_script('{}/llm.text_encoder.fp16.zip'.format(model_dir),
+                                   '{}/llm.llm.fp16.zip'.format(model_dir))
         del configs
 
     def list_avaliable_spks(self):
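The new load_script flag decides whether CosyVoice swaps in the exported TorchScript archives or keeps only the eager llm.pt weights. A minimal usage sketch (the model path is an assumption; loading only succeeds if the fp16 zips produced by the export script are present in the model directory):

from cosyvoice.cli.cosyvoice import CosyVoice

# Default: also load the scripted fp16 modules from
# <model_dir>/llm.text_encoder.fp16.zip and <model_dir>/llm.llm.fp16.zip.
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M')

# Opt out if the TorchScript archives were not exported; only llm.pt is used.
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M', load_script=False)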
cosyvoice/cli/model.py CHANGED
@@ -47,11 +47,18 @@ class CosyVoiceModel:
     def load(self, llm_model, flow_model, hift_model):
         self.llm.load_state_dict(torch.load(llm_model, map_location=self.device))
         self.llm.to(self.device).eval()
+        self.llm.half()
         self.flow.load_state_dict(torch.load(flow_model, map_location=self.device))
         self.flow.to(self.device).eval()
         self.hift.load_state_dict(torch.load(hift_model, map_location=self.device))
         self.hift.to(self.device).eval()
 
+    def load_script(self, llm_text_encoder_model, llm_llm_model):
+        llm_text_encoder = torch.jit.load(llm_text_encoder_model)
+        self.llm.text_encoder = llm_text_encoder
+        llm_llm = torch.jit.load(llm_llm_model)
+        self.llm.llm = llm_llm
+
     def llm_job(self, text, text_len, prompt_text, prompt_text_len, llm_prompt_speech_token, llm_prompt_speech_token_len, llm_embedding, this_uuid):
         with self.llm_context:
             for i in self.llm.inference(text=text.to(self.device),
@@ -60,7 +67,7 @@ class CosyVoiceModel:
                                         prompt_text_len=prompt_text_len.to(self.device),
                                         prompt_speech_token=llm_prompt_speech_token.to(self.device),
                                         prompt_speech_token_len=llm_prompt_speech_token_len.to(self.device),
-                                        embedding=llm_embedding.to(self.device),
+                                        embedding=llm_embedding.to(self.device).half(),
                                         sampling=25,
                                         max_token_text_ratio=30,
                                         min_token_text_ratio=3):
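load_script() expects two TorchScript archives sitting next to the checkpoints. The export script this commit adds is not part of the excerpt above, so the following is only a plausible sketch of how such fp16 archives are typically produced; the config path, checkpoint name, and the 'llm' config key are assumptions inferred from the loader code:

import torch
from hyperpyyaml import load_hyperpyyaml

model_dir = 'pretrained_models/CosyVoice-300M'  # assumed location
with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
    configs = load_hyperpyyaml(f)

llm = configs['llm']
llm.load_state_dict(torch.load('{}/llm.pt'.format(model_dir), map_location='cpu'))
llm.eval().half()  # mirrors the self.llm.half() call in CosyVoiceModel.load()

# Script the two submodules that load_script() re-attaches at inference time.
torch.jit.script(llm.text_encoder).save('{}/llm.text_encoder.fp16.zip'.format(model_dir))
torch.jit.script(llm.llm).save('{}/llm.llm.fp16.zip'.format(model_dir))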
cosyvoice/transformer/attention.py CHANGED
@@ -222,7 +222,7 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
         torch.nn.init.xavier_uniform_(self.pos_bias_u)
         torch.nn.init.xavier_uniform_(self.pos_bias_v)
 
-    def rel_shift(self, x):
+    def rel_shift(self, x: torch.Tensor) -> torch.Tensor:
         """Compute relative positional encoding.
 
         Args:
@@ -233,10 +233,14 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
             torch.Tensor: Output tensor.
 
         """
-        zero_pad = torch.zeros((*x.size()[:3], 1), device=x.device, dtype=x.dtype)
+        zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1),
+                               device=x.device,
+                               dtype=x.dtype)
         x_padded = torch.cat([zero_pad, x], dim=-1)
 
-        x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2))
+        x_padded = x_padded.view(x.size()[0],
+                                 x.size()[1],
+                                 x.size(3) + 1, x.size(2))
         x = x_padded[:, :, 1:].view_as(x)[
             :, :, :, : x.size(-1) // 2 + 1
         ] # only keep the positions from 0 to time2
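The rel_shift rewrite does not alter the computation; it only replaces starred unpacking of x.size(), which torch.jit.script rejects inside a tuple literal, with explicit indexing. A quick equivalence check on an arbitrary 4-D tensor (the shape values here are made up):

import torch

x = torch.randn(2, 4, 8, 15)  # (batch, head, time1, 2*time2-1)

# eager-only spelling that was removed
zero_pad_old = torch.zeros((*x.size()[:3], 1), device=x.device, dtype=x.dtype)
# scriptable spelling that replaces it
zero_pad_new = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1),
                           device=x.device, dtype=x.dtype)
assert torch.equal(zero_pad_old, zero_pad_new)

old_view = torch.cat([zero_pad_old, x], dim=-1).view(*x.size()[:2], x.size(3) + 1, x.size(2))
new_view = torch.cat([zero_pad_new, x], dim=-1).view(x.size()[0], x.size()[1],
                                                     x.size(3) + 1, x.size(2))
assert torch.equal(old_view, new_view)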
cosyvoice/transformer/decoder.py CHANGED
@@ -174,7 +174,7 @@ class TransformerDecoder(torch.nn.Module):
                                    memory_mask)
         return x
 
-    @torch.jit.ignore(drop=True)
+    @torch.jit.unused
    def forward_layers_checkpointed(self, x: torch.Tensor,
                                     tgt_mask: torch.Tensor,
                                     memory: torch.Tensor,
cosyvoice/transformer/embedding.py CHANGED
@@ -212,7 +212,7 @@ class EspnetRelPositionalEncoding(torch.nn.Module):
 
     """
 
-    def __init__(self, d_model, dropout_rate, max_len=5000):
+    def __init__(self, d_model: int, dropout_rate: float, max_len: int=5000):
         """Construct an PositionalEncoding object."""
         super(EspnetRelPositionalEncoding, self).__init__()
         self.d_model = d_model
@@ -221,7 +221,7 @@ class EspnetRelPositionalEncoding(torch.nn.Module):
         self.pe = None
         self.extend_pe(torch.tensor(0.0).expand(1, max_len))
 
-    def extend_pe(self, x):
+    def extend_pe(self, x: torch.Tensor):
         """Reset the positional encodings."""
         if self.pe is not None:
             # self.pe contains both positive and negative parts
@@ -253,7 +253,8 @@ class EspnetRelPositionalEncoding(torch.nn.Module):
         pe = torch.cat([pe_positive, pe_negative], dim=1)
         self.pe = pe.to(device=x.device, dtype=x.dtype)
 
-    def forward(self, x: torch.Tensor, offset: Union[int, torch.Tensor] = 0):
+    def forward(self, x: torch.Tensor, offset: Union[int, torch.Tensor] = 0) \
+            -> Tuple[torch.Tensor, torch.Tensor]:
         """Add positional encoding.
 
         Args:
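The new annotations are what let EspnetRelPositionalEncoding compile: torch.jit.script assumes every un-annotated argument of a compiled method is a Tensor, so scalar parameters and non-Tensor return types have to be declared explicitly. A toy illustration of that rule (not CosyVoice code):

import torch

class Scale(torch.nn.Module):
    def __init__(self, factor: float):
        super().__init__()
        self.factor = factor

    def forward(self, x: torch.Tensor, offset: int = 0) -> torch.Tensor:
        # without the 'offset: int' annotation, scripting would type offset
        # as Tensor and reject the integer default value
        return x * self.factor + offset

scripted = torch.jit.script(Scale(0.5))
print(scripted(torch.ones(2), offset=1))  # tensor([1.5000, 1.5000])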
cosyvoice/transformer/encoder.py CHANGED
@@ -169,7 +169,7 @@ class BaseEncoder(torch.nn.Module):
             xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
         return xs
 
-    @torch.jit.ignore(drop=True)
+    @torch.jit.unused
     def forward_layers_checkpointed(self, xs: torch.Tensor,
                                     chunk_masks: torch.Tensor,
                                     pos_emb: torch.Tensor,
@@ -180,6 +180,7 @@ class BaseEncoder(torch.nn.Module):
                                               mask_pad)
         return xs
 
+    @torch.jit.export
     def forward_chunk(
         self,
         xs: torch.Tensor,
@@ -270,6 +271,7 @@ class BaseEncoder(torch.nn.Module):
 
         return (xs, r_att_cache, r_cnn_cache)
 
+    @torch.jit.unused
     def forward_chunk_by_chunk(
         self,
         xs: torch.Tensor,
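The decorator swaps here and in decoder.py follow the current TorchScript API: @torch.jit.ignore(drop=True) is the deprecated spelling of @torch.jit.unused, which keeps a method in Python and excludes it from compilation, while @torch.jit.export forces forward_chunk to be compiled even though forward() never calls it, presumably because the streaming path invokes it directly on the scripted module. A small illustration of the two decorators with a toy module, not the encoder itself:

import torch

class Toy(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + 1

    @torch.jit.export
    def forward_chunk(self, x: torch.Tensor) -> torch.Tensor:
        # compiled and callable on the scripted module even though
        # forward() never references it
        return x * 2

    @torch.jit.unused
    def forward_chunk_by_chunk(self, x):
        # stays in Python; calling it on the scripted module raises
        return [x[i] for i in range(x.size(0))]

scripted = torch.jit.script(Toy())
print(scripted.forward_chunk(torch.ones(2)))  # tensor([2., 2.])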
examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml CHANGED
@@ -31,7 +31,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
     num_blocks: 3
     dropout_rate: 0.1
     positional_dropout_rate: 0.1
-    attention_dropout_rate: 0
+    attention_dropout_rate: 0.0
     normalize_before: True
     input_layer: 'linear'
     pos_enc_layer_type: 'rel_pos_espnet'
@@ -49,7 +49,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
     num_blocks: 7
     dropout_rate: 0.1
     positional_dropout_rate: 0.1
-    attention_dropout_rate: 0
+    attention_dropout_rate: 0.0
     input_layer: 'linear_legacy'
     pos_enc_layer_type: 'rel_pos_espnet'
     selfattention_layer_type: 'rel_selfattn'
@@ -102,7 +102,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
     in_channels: 320
     out_channels: 80
     channels: [256, 256]
-    dropout: 0
+    dropout: 0.0
     attention_head_dim: 64
     n_blocks: 4
     num_mid_blocks: 8
examples/libritts/cosyvoice/conf/cosyvoice.yaml CHANGED
@@ -31,7 +31,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
     num_blocks: 6
     dropout_rate: 0.1
     positional_dropout_rate: 0.1
-    attention_dropout_rate: 0
+    attention_dropout_rate: 0.0
     normalize_before: True
     input_layer: 'linear'
     pos_enc_layer_type: 'rel_pos_espnet'
@@ -49,7 +49,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
     num_blocks: 14
     dropout_rate: 0.1
     positional_dropout_rate: 0.1
-    attention_dropout_rate: 0
+    attention_dropout_rate: 0.0
     input_layer: 'linear_legacy'
     pos_enc_layer_type: 'rel_pos_espnet'
     selfattention_layer_type: 'rel_selfattn'
@@ -102,7 +102,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
     in_channels: 320
     out_channels: 80
     channels: [256, 256]
-    dropout: 0
+    dropout: 0.0
     attention_head_dim: 64
     n_blocks: 4
     num_mid_blocks: 12
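The 0 -> 0.0 edits in both configs only change the YAML scalar type: a bare 0 resolves to a Python int while 0.0 resolves to a float, presumably so the float-annotated constructor arguments (e.g. a dropout_rate: float parameter) receive a real float once the modules are scripted. A quick check with PyYAML; hyperpyyaml should resolve plain scalars the same way:

import yaml

cfg = yaml.safe_load('attention_dropout_rate: 0\ndropout: 0.0\n')
print(type(cfg['attention_dropout_rate']))  # <class 'int'>
print(type(cfg['dropout']))                 # <class 'float'>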
webui.py CHANGED
@@ -173,7 +173,7 @@ if __name__ == '__main__':
                         default=8000)
     parser.add_argument('--model_dir',
                         type=str,
-                        default='iic/CosyVoice-300M',
+                        default='pretrained_models/CosyVoice-300M',
                         help='local path or modelscope repo id')
     args = parser.parse_args()
     cosyvoice = CosyVoice(args.model_dir)