CosyVoice committed
Commit 0fd15bb
1 Parent(s): a723ea3

use spk_embedding when sft

cosyvoice/flow/flow.py CHANGED
@@ -60,7 +60,7 @@ class MaskedDiffWithXvec(torch.nn.Module):
         token_len = batch['speech_token_len'].to(device)
         feat = batch['speech_feat'].to(device)
         feat_len = batch['speech_feat_len'].to(device)
-        embedding = batch['utt_embedding'].to(device)
+        embedding = batch['embedding'].to(device)
 
         # xvec projection
         embedding = F.normalize(embedding, dim=1)
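The flow model now reads the generic 'embedding' key that the executor fills in per batch. Below is a minimal, self-contained sketch of the embedding path around these lines; the projection-layer name (spk_embed_affine_layer) and the dimensions are assumptions for illustration, not the repository's exact definitions.

# Minimal sketch of the embedding path in MaskedDiffWithXvec.forward;
# the layer name and sizes below are assumptions for illustration.
import torch
import torch.nn.functional as F

class XvecProjection(torch.nn.Module):
    def __init__(self, spk_embed_dim=192, output_size=80):
        super().__init__()
        # hypothetical affine layer mapping the x-vector to the flow hidden size
        self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size)

    def forward(self, batch, device='cpu'):
        # after this commit the batch carries a generic 'embedding' key,
        # filled by the executor with either utt_embedding or spk_embedding
        embedding = batch['embedding'].to(device)
        embedding = F.normalize(embedding, dim=1)  # L2-normalize the x-vector
        return self.spk_embed_affine_layer(embedding)

batch = {'embedding': torch.randn(4, 192)}
print(XvecProjection()(batch).shape)  # torch.Size([4, 80])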
cosyvoice/llm/llm.py CHANGED
@@ -97,7 +97,7 @@ class TransformerLM(torch.nn.Module):
         text_token_len = batch['text_token_len'].to(device)
         speech_token = batch['speech_token'].to(device)
         speech_token_len = batch['speech_token_len'].to(device)
-        embedding = batch['utt_embedding'].to(device)
+        embedding = batch['embedding'].to(device)
 
         # 1. prepare llm_target
         lm_target = [torch.tensor([IGNORE_ID] * (2 + text_token_len[i]) + speech_token[i, :speech_token_len[i]].tolist() + [self.speech_token_size]) for i in range(text_token.size(0))]
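For context, the unchanged lm_target line above masks the first 2 + text_token_len positions with IGNORE_ID and supervises only the speech tokens followed by an end-of-speech id. A standalone illustration of that construction follows; IGNORE_ID = -1 and speech_token_size = 4096 are assumed values for this sketch.

# Standalone illustration of the lm_target construction shown in the hunk above;
# IGNORE_ID = -1 and speech_token_size = 4096 are assumptions for this sketch.
import torch

IGNORE_ID = -1
speech_token_size = 4096

text_token_len = torch.tensor([3, 2])          # text tokens per utterance
speech_token = torch.tensor([[11, 12, 13, 0],  # padded speech tokens
                             [21, 22, 0, 0]])
speech_token_len = torch.tensor([3, 2])

# the first 2 + text_token_len positions are ignored by the loss; the speech
# tokens and a final end-of-speech id (speech_token_size) are the targets
lm_target = [torch.tensor([IGNORE_ID] * (2 + text_token_len[i])
                          + speech_token[i, :speech_token_len[i]].tolist()
                          + [speech_token_size])
             for i in range(speech_token.size(0))]
print(lm_target[0])  # -> [-1, -1, -1, -1, -1, 11, 12, 13, 4096]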
cosyvoice/utils/executor.py CHANGED
@@ -52,6 +52,10 @@ class Executor:
             info_dict["batch_idx"] = batch_idx
             if cosyvoice_join(group_join, info_dict):
                 break
+            if info_dict["use_spk_embedding"] is True:
+                batch_dict["embedding"] = batch_dict["spk_embedding"]
+            else:
+                batch_dict["embedding"] = batch_dict["utt_embedding"]
 
             # Disable gradient synchronizations across DDP processes.
             # Within this context, gradients will be accumulated on module
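These four added lines are the core of the commit: the executor fills one generic 'embedding' key per batch, so the LLM and flow models no longer need to know whether training runs on utterance-level or speaker-level x-vectors. A minimal sketch of that routing, with illustrative batch contents and an assumed embedding dimension of 192:

# Minimal sketch of the routing added to the Executor training loop: a single
# 'embedding' key feeds both the LLM and the flow model, and the config flag
# decides whether it holds the utterance-level or the speaker-level x-vector.
import torch

def select_embedding(batch_dict, info_dict):
    # mirrors the four added lines; the dict contents here are illustrative
    if info_dict["use_spk_embedding"] is True:
        batch_dict["embedding"] = batch_dict["spk_embedding"]   # sft: speaker-averaged x-vector
    else:
        batch_dict["embedding"] = batch_dict["utt_embedding"]   # pretraining: per-utterance x-vector
    return batch_dict

batch_dict = {
    "utt_embedding": torch.randn(4, 192),   # embedding dim 192 is an assumption
    "spk_embedding": torch.randn(4, 192),
}
info_dict = {"use_spk_embedding": True}     # comes from train_conf in the YAML configs
out = select_embedding(batch_dict, info_dict)
print(out["embedding"] is batch_dict["spk_embedding"])  # True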
examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml CHANGED
@@ -190,6 +190,7 @@ train_conf:
     scheduler: warmuplr
     scheduler_conf:
         warmup_steps: 25000
+    use_spk_embedding: False # change to True during sft
     max_epoch: 200
     grad_clip: 5
     accum_grad: 2
examples/libritts/cosyvoice/conf/cosyvoice.yaml CHANGED
@@ -190,6 +190,7 @@ train_conf:
     scheduler: warmuplr # change to constantlr during sft
     scheduler_conf:
         warmup_steps: 2500
+    use_spk_embedding: False # change to True during sft
     max_epoch: 200
     grad_clip: 5
     accum_grad: 2
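Both configs gain the same train_conf flag, which the training script passes through to the executor's info_dict. A rough sketch of how the flag could be read (plain PyYAML here; the repository's actual config loading and key merging may differ):

# Sketch of how the new flag might flow from the YAML into the executor's
# info_dict; plain PyYAML is used for illustration only.
import yaml

config_text = """
train_conf:
    scheduler: warmuplr          # change to constantlr during sft
    scheduler_conf:
        warmup_steps: 2500
    use_spk_embedding: False     # change to True during sft
    max_epoch: 200
    grad_clip: 5
    accum_grad: 2
"""

configs = yaml.safe_load(config_text)
info_dict = dict(configs['train_conf'])   # the dict the training loop consults
print(info_dict['use_spk_embedding'])     # False -> batches use utt_embedding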