jbetker committed
Commit b07fb37
1 Parent(s): b1ba841

Clip diffusion inputs

Files changed (1): api.py (+15 -2)
api.py CHANGED

@@ -181,6 +181,7 @@ class TextToSpeech:
         samples = []
         num_batches = num_autoregressive_samples // self.autoregressive_batch_size
         stop_mel_token = self.autoregressive.stop_mel_token
+        calm_token = 83 # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"
         self.autoregressive = self.autoregressive.cuda()
         for b in tqdm(range(num_batches)):
             codes = self.autoregressive.inference_speech(conds, text,
@@ -212,8 +213,20 @@ class TextToSpeech:
         self.diffusion = self.diffusion.cuda()
         self.vocoder = self.vocoder.cuda()
         for b in range(best_results.shape[0]):
-            code = best_results[b].unsqueeze(0)
-            mel = do_spectrogram_diffusion(self.diffusion, diffuser, code, voice_samples, temperature=diffusion_temperature)
+            codes = best_results[b].unsqueeze(0)
+
+            # Find the first occurrence of the "calm" token and trim the codes to that.
+            ctokens = 0
+            for k in range(codes.shape[-1]):
+                if codes[0, k] == calm_token:
+                    ctokens += 1
+                else:
+                    ctokens = 0
+                if ctokens > 8: # 8 tokens gives the diffusion model some "breathing room" to terminate speech.
+                    codes = codes[:, :k]
+                    break
+
+            mel = do_spectrogram_diffusion(self.diffusion, diffuser, codes, voice_samples, temperature=diffusion_temperature)
             wav = self.vocoder.inference(mel)
             wav_candidates.append(wav.cpu())
         self.diffusion = self.diffusion.cpu()
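
For readers skimming the diff, the added loop is equivalent to the small standalone sketch below. This is illustrative only, not code from the repository: the helper name trim_trailing_calm_tokens, the breathing_room parameter, and the toy tensor are assumptions; api.py performs the same trimming inline on codes before calling do_spectrogram_diffusion.

    import torch

    def trim_trailing_calm_tokens(codes: torch.Tensor, calm_token: int = 83, breathing_room: int = 8) -> torch.Tensor:
        # Walk the (1, N) code sequence and count consecutive calm tokens;
        # once the run exceeds breathing_room, clip everything from there on,
        # so the diffusion model still sees a short stretch of silence to end on.
        ctokens = 0
        for k in range(codes.shape[-1]):
            if codes[0, k] == calm_token:
                ctokens += 1
            else:
                ctokens = 0
            if ctokens > breathing_room:
                return codes[:, :k]
        return codes

    # Toy example: three speech codes followed by a long calm run.
    codes = torch.tensor([[10, 11, 12] + [83] * 12])
    trimmed = trim_trailing_calm_tokens(codes)
    print(trimmed.shape)  # torch.Size([1, 11]) -> 3 speech codes + 8 calm tokens kept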