Support totally random voices (and make fixes to previous changes)
Files changed:
- .gitignore +2 -1
- tortoise/api.py +50 -17
- tortoise/do_tts.py +6 -6
- tortoise/models/autoregressive.py +5 -5
- tortoise/models/diffusion_decoder.py +2 -3
- tortoise/models/random_latent_generator.py +55 -0
- tortoise/read.py +2 -2
- tortoise/utils/audio.py +3 -0
.gitignore
CHANGED
@@ -129,6 +129,7 @@ dmypy.json
 .pyre/
 
 .idea/*
-
+tortoise/.models/*
+tortoise/random_voices/*
 .custom/*
 results/*
tortoise/api.py
CHANGED
@@ -1,5 +1,6 @@
 import os
 import random
+import uuid
 from urllib import request
 
 import torch
@@ -15,6 +16,7 @@ from tqdm import tqdm
 
 from tortoise.models.arch_util import TorchMelSpectrogram
 from tortoise.models.clvp import CLVP
+from tortoise.models.random_latent_generator import RandomLatentConverter
 from tortoise.models.vocoder import UnivNetGenerator
 from tortoise.utils.audio import wav_to_univnet_mel, denormalize_tacotron_mel
 from tortoise.utils.diffusion import SpacedDiffusion, space_timesteps, get_named_beta_schedule
@@ -161,7 +163,8 @@ class TextToSpeech:
     Main entry point into Tortoise.
     """
 
-    def __init__(self, autoregressive_batch_size=16, models_dir='.models', enable_redaction=True):
+    def __init__(self, autoregressive_batch_size=16, models_dir='.models', enable_redaction=True,
+                 save_random_voices=False):
         """
         Constructor
         :param autoregressive_batch_size: Specifies how many samples to generate per batch. Lower this if you are seeing
@@ -170,11 +173,15 @@ class TextToSpeech:
               models, otherwise use the defaults.
         :param enable_redaction: When true, text enclosed in brackets are automatically redacted from the spoken output
                                  (but are still rendered by the model). This can be used for prompt engineering.
+                                 Default is true.
+        :param save_random_voices: When true, voices that are randomly generated are saved to the `random_voices`
+                                   directory. Default is false.
         """
         self.autoregressive_batch_size = autoregressive_batch_size
         self.enable_redaction = enable_redaction
         if self.enable_redaction:
             self.aligner = Wav2VecAlignment()
+        self.save_random_voices = save_random_voices
 
         self.tokenizer = VoiceBpeTokenizer()
         download_models()
@@ -210,6 +217,10 @@ class TextToSpeech:
         self.vocoder.load_state_dict(torch.load(f'{models_dir}/vocoder.pth')['model_g'])
         self.vocoder.eval(inference=True)
 
+        # Random latent generators (RLGs) are loaded lazily.
+        self.rlg_auto = None
+        self.rlg_diffusion = None
+
     def tts_with_preset(self, text, preset='fast', **kwargs):
         """
         Calls TTS with one of a set of preset generation parameters. Options:
@@ -265,7 +276,21 @@
         diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
         self.diffusion = self.diffusion.cpu()
 
-        return auto_latent, diffusion_latent
+        return auto_latent, diffusion_latent, auto_conds
+
+    def get_random_conditioning_latents(self):
+        # Lazy-load the RLG models.
+        if self.rlg_auto is None:
+            self.rlg_auto = RandomLatentConverter(1024).eval()
+            self.rlg_auto.load_state_dict(torch.load('.models/rlg_auto.pth', map_location=torch.device('cpu')))
+            self.rlg_diffusion = RandomLatentConverter(2048).eval()
+            self.rlg_diffusion.load_state_dict(torch.load('.models/rlg_diffuser.pth', map_location=torch.device('cpu')))
+        with torch.no_grad():
+            latents = self.rlg_auto(torch.tensor([0.0])), self.rlg_diffusion(torch.tensor([0.0]))
+            if self.save_random_voices:
+                os.makedirs('random_voices', exist_ok=True)
+                torch.save(latents, f'random_voices/{str(uuid.uuid4())}.pth')
+            return latents
 
     def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=True,
             # autoregressive generation parameters follow
@@ -323,14 +348,19 @@
         :return: Generated audio clip(s) as a torch tensor. Shape 1,S if k=1 else, (k,1,S) where S is the sample length.
                  Sample rate is 24kHz.
         """
-
-
-        assert
+        text_tokens = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0).cuda()
+        text_tokens = F.pad(text_tokens, (0, 1))  # This may not be necessary.
+        assert text_tokens.shape[-1] < 400, 'Too much text provided. Break the text up into separate segments and re-try inference.'
 
+        auto_conds = None
         if voice_samples is not None:
-            auto_conditioning, diffusion_conditioning = self.get_conditioning_latents(voice_samples)
-        else:
+            auto_conditioning, diffusion_conditioning, auto_conds = self.get_conditioning_latents(voice_samples)
+        elif conditioning_latents is not None:
             auto_conditioning, diffusion_conditioning = conditioning_latents
+        else:
+            auto_conditioning, diffusion_conditioning = self.get_random_conditioning_latents()
+        auto_conditioning = auto_conditioning.cuda()
+        diffusion_conditioning = diffusion_conditioning.cuda()
 
         diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
 
@@ -343,7 +373,7 @@
             if verbose:
                 print("Generating autoregressive samples..")
             for b in tqdm(range(num_batches), disable=not verbose):
-                codes = self.autoregressive.inference_speech(auto_conditioning,
+                codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens,
                                                              do_sample=True,
                                                              top_p=top_p,
                                                              temperature=temperature,
@@ -365,12 +395,15 @@
             for batch in tqdm(samples, disable=not verbose):
                 for i in range(batch.shape[0]):
                     batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
-                clvp = self.clvp(
-
-
-
-
-
+                clvp = self.clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False)
+                if auto_conds is not None:
+                    cvvp_accumulator = 0
+                    for cl in range(auto_conds.shape[1]):
+                        cvvp_accumulator = cvvp_accumulator + self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False)
+                    cvvp = cvvp_accumulator / auto_conds.shape[1]
+                    clip_results.append(clvp * clvp_cvvp_slider + cvvp * (1-clvp_cvvp_slider))
+                else:
+                    clip_results.append(clvp)
             clip_results = torch.cat(clip_results, dim=0)
             samples = torch.cat(samples, dim=0)
             best_results = samples[torch.topk(clip_results, k=k).indices]
@@ -382,8 +415,8 @@
            # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
            # results, but will increase memory usage.
            self.autoregressive = self.autoregressive.cuda()
-           best_latents = self.autoregressive(auto_conditioning,
-                                              torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=
+           best_latents = self.autoregressive(auto_conditioning, text_tokens, torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results,
+                                              torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
                                               return_latent=True, clip_inputs=False)
            self.autoregressive = self.autoregressive.cpu()
            del auto_conditioning
@@ -415,7 +448,7 @@
            self.diffusion = self.diffusion.cpu()
            self.vocoder = self.vocoder.cpu()
 
-           def potentially_redact(
+           def potentially_redact(clip, text):
               if self.enable_redaction:
                   return self.aligner.redact(clip, text)
               return clip
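
Taken together, the api.py changes mean a caller can omit both voice_samples and conditioning_latents and Tortoise will sample a brand-new voice from the random latent generators. A minimal usage sketch (the text and output filename here are illustrative, not part of the commit; saving mirrors what do_tts.py does):

from tortoise.api import TextToSpeech
import torchaudio

# With no reference clips and no latents, tts() falls through to get_random_conditioning_latents().
tts = TextToSpeech(save_random_voices=True)  # also writes each sampled voice to random_voices/
gen = tts.tts_with_preset("A sentence spoken by a voice that does not exist.",
                          preset='fast', voice_samples=None, conditioning_latents=None)
torchaudio.save('random_voice.wav', gen.squeeze(0).cpu(), 24000)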
tortoise/do_tts.py
CHANGED
@@ -10,23 +10,23 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--text', type=str, help='Text to speak.', default="I am a language model that has learned to speak.")
     parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
-                                                  'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='
-    parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='
+                                                  'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='random')
+    parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='fast')
     parser.add_argument('--voice_diversity_intelligibility_slider', type=float,
                         help='How to balance vocal diversity with the quality/intelligibility of the spoken text. 0 means highly diverse voice (not recommended), 1 means maximize intellibility',
                         default=.5)
-    parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/')
+    parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='../results/')
     parser.add_argument('--model_dir', type=str, help='Where to find pretrained model checkpoints. Tortoise automatically downloads these to .models, so this'
                                                       'should only be specified if you have custom checkpoints.', default='.models')
     args = parser.parse_args()
     os.makedirs(args.output_path, exist_ok=True)
 
-    tts = TextToSpeech(models_dir=args.model_dir)
+    tts = TextToSpeech(models_dir=args.model_dir, save_random_voices=True)
 
     selected_voices = args.voice.split(',')
-    for voice in selected_voices:
+    for k, voice in enumerate(selected_voices):
        voice_samples, conditioning_latents = load_voice(voice)
        gen = tts.tts_with_preset(args.text, voice_samples=voice_samples, conditioning_latents=conditioning_latents,
                                  preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider)
-        torchaudio.save(os.path.join(args.output_path, f'{voice}.wav'), gen.squeeze(0).cpu(), 24000)
+        torchaudio.save(os.path.join(args.output_path, f'{voice}_{k}.wav'), gen.squeeze(0).cpu(), 24000)
 
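
With --voice now defaulting to random and --preset to fast, the script can synthesize an entirely new speaker with no voice clips on hand; an illustrative invocation (text and output path are examples, assuming the command is run from the repository root):

python tortoise/do_tts.py --text "Hello from a speaker that was just sampled." --voice random --output_path results/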
tortoise/models/autoregressive.py
CHANGED
@@ -401,13 +401,13 @@ class UnifiedVoice(nn.Module):
         conds = conds.mean(dim=1).unsqueeze(1)
         return conds
 
-    def forward(self,
+    def forward(self, speech_conditioning_latent, text_inputs, text_lengths, mel_codes, wav_lengths, types=None, text_first=True, raw_mels=None, return_attentions=False,
                 return_latent=False, clip_inputs=True):
         """
         Forward pass that uses both text and voice in either text conditioning mode or voice conditioning mode
         (actuated by `text_first`).
 
-        speech_conditioning_input: MEL float tensor, (b,
+        speech_conditioning_input: MEL float tensor, (b,1024)
         text_inputs: long tensor, (b,t)
         text_lengths: long tensor, (b,)
         mel_inputs: long tensor, (b,m)
@@ -421,7 +421,7 @@ class UnifiedVoice(nn.Module):
         # Types are expressed by expanding the text embedding space.
         if types is not None:
             text_inputs = text_inputs * (1+types).unsqueeze(-1)
-
+
         if clip_inputs:
             # This model will receive micro-batches with a ton of padding for both the text and MELs. Ameliorate this by
             # chopping the inputs by the maximum actual length.
@@ -435,7 +435,7 @@ class UnifiedVoice(nn.Module):
         text_inputs = F.pad(text_inputs, (0,1), value=self.stop_text_token)
         mel_codes = F.pad(mel_codes, (0,1), value=self.stop_mel_token)
 
-        conds =
+        conds = speech_conditioning_latent.unsqueeze(1)
         text_inputs, text_targets = self.build_aligned_inputs_and_targets(text_inputs, self.start_text_token, self.stop_text_token)
         text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs)
         mel_codes, mel_targets = self.build_aligned_inputs_and_targets(mel_codes, self.start_mel_token, self.stop_mel_token)
@@ -540,7 +540,7 @@ class UnifiedVoice(nn.Module):
         text_inputs, text_targets = self.build_aligned_inputs_and_targets(text_inputs, self.start_text_token, self.stop_text_token)
         text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs)
 
-        conds = speech_conditioning_latent
+        conds = speech_conditioning_latent.unsqueeze(1)
         emb = torch.cat([conds, text_emb], dim=1)
         self.inference_model.store_mel_emb(emb)
 
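
The net effect of these changes is that forward() and the inference path now take the already-pooled conditioning latent and expand it to a length-one sequence before concatenating it with the text embeddings. A rough shape sketch (the batch size, text length, and 1024-wide model dimension are illustrative assumptions):

import torch

speech_conditioning_latent = torch.randn(2, 1024)   # (b, 1024) from get_conditioning() or the RLG
conds = speech_conditioning_latent.unsqueeze(1)      # (b, 1, 1024)
text_emb = torch.randn(2, 30, 1024)                  # (b, t, 1024) text embeddings
emb = torch.cat([conds, text_emb], dim=1)            # (b, 1 + t, 1024), fed to the transformer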
tortoise/models/diffusion_decoder.py
CHANGED
@@ -226,6 +226,7 @@ class DiffusionTts(nn.Module):
         for j in range(speech_conditioning_input.shape[1]):
             conds.append(self.contextual_embedder(speech_conditioning_input[:, j]))
         conds = torch.cat(conds, dim=-1)
+        conds = conds.mean(dim=-1)
         return conds
 
     def timestep_independent(self, aligned_conditioning, conditioning_latent, expected_seq_len, return_code_pred):
@@ -233,9 +234,7 @@ class DiffusionTts(nn.Module):
         if is_latent(aligned_conditioning):
             aligned_conditioning = aligned_conditioning.permute(0, 2, 1)
 
-
-        cond_emb = conds.mean(dim=-1)
-        cond_scale, cond_shift = torch.chunk(cond_emb, 2, dim=1)
+        cond_scale, cond_shift = torch.chunk(conditioning_latent, 2, dim=1)
         if is_latent(aligned_conditioning):
             code_emb = self.latent_conditioner(aligned_conditioning)
         else:
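
Here the averaging over reference clips moves into get_conditioning(), so timestep_independent() can chunk the pre-pooled conditioning_latent directly; that is what lets a 2048-wide random latent stand in for real reference audio. A shape-only sketch (the number of clips and batch size are illustrative assumptions):

import torch

conds = torch.randn(1, 2048, 3)                       # embeddings of 3 reference clips, concatenated on the last dim
conditioning_latent = conds.mean(dim=-1)              # (1, 2048), now returned by get_conditioning()
cond_scale, cond_shift = torch.chunk(conditioning_latent, 2, dim=1)  # each (1, 1024)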
tortoise/models/random_latent_generator.py
ADDED
@@ -0,0 +1,55 @@
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def fused_leaky_relu(input, bias=None, negative_slope=0.2, scale=2 ** 0.5):
+    if bias is not None:
+        rest_dim = [1] * (input.ndim - bias.ndim - 1)
+        return (
+            F.leaky_relu(
+                input + bias.view(1, bias.shape[0], *rest_dim), negative_slope=negative_slope
+            )
+            * scale
+        )
+    else:
+        return F.leaky_relu(input, negative_slope=0.2) * scale
+
+
+class EqualLinear(nn.Module):
+    def __init__(
+        self, in_dim, out_dim, bias=True, bias_init=0, lr_mul=1
+    ):
+        super().__init__()
+        self.weight = nn.Parameter(torch.randn(out_dim, in_dim).div_(lr_mul))
+        if bias:
+            self.bias = nn.Parameter(torch.zeros(out_dim).fill_(bias_init))
+        else:
+            self.bias = None
+        self.scale = (1 / math.sqrt(in_dim)) * lr_mul
+        self.lr_mul = lr_mul
+
+    def forward(self, input):
+        out = F.linear(input, self.weight * self.scale)
+        out = fused_leaky_relu(out, self.bias * self.lr_mul)
+        return out
+
+
+class RandomLatentConverter(nn.Module):
+    def __init__(self, channels):
+        super().__init__()
+        self.layers = nn.Sequential(*[EqualLinear(channels, channels, lr_mul=.1) for _ in range(5)],
+                                    nn.Linear(channels, channels))
+        self.channels = channels
+
+    def forward(self, ref):
+        r = torch.randn(ref.shape[0], self.channels, device=ref.device)
+        y = self.layers(r)
+        return y
+
+
+if __name__ == '__main__':
+    model = RandomLatentConverter(512)
+    model(torch.randn(5,512))
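
A sketch of how the new module is consumed, mirroring get_random_conditioning_latents() in api.py (the weight file is the one referenced there and must already be downloaded to .models/):

import torch
from tortoise.models.random_latent_generator import RandomLatentConverter

rlg_auto = RandomLatentConverter(1024).eval()
rlg_auto.load_state_dict(torch.load('.models/rlg_auto.pth', map_location=torch.device('cpu')))
with torch.no_grad():
    # The input tensor only supplies the batch size and device; the voice itself comes from fresh Gaussian noise.
    auto_latent = rlg_auto(torch.tensor([0.0]))   # shape (1, 1024)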
tortoise/read.py
CHANGED
@@ -31,7 +31,7 @@ if __name__ == '__main__':
     parser.add_argument('--textfile', type=str, help='A file containing the text to read.', default="data/riding_hood.txt")
     parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
                                                   'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='pat')
-    parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/longform/')
+    parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='../results/longform/')
     parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard')
     parser.add_argument('--regenerate', type=str, help='Comma-separated list of clip numbers to re-generate, or nothing.', default=None)
     parser.add_argument('--voice_diversity_intelligibility_slider', type=float,
@@ -40,7 +40,7 @@ if __name__ == '__main__':
     parser.add_argument('--model_dir', type=str, help='Where to find pretrained model checkpoints. Tortoise automatically downloads these to .models, so this'
                                                       'should only be specified if you have custom checkpoints.', default='.models')
     args = parser.parse_args()
-    tts = TextToSpeech(models_dir=args.model_dir)
+    tts = TextToSpeech(models_dir=args.model_dir, save_random_voices=True)
 
     outpath = args.output_path
     selected_voices = args.voice.split(',')
tortoise/utils/audio.py
CHANGED
@@ -92,6 +92,9 @@ def get_voices():
 
 
 def load_voice(voice):
+    if voice == 'random':
+        return None, None
+
     voices = get_voices()
     paths = voices[voice]
     if len(paths) == 1 and paths[0].endswith('.pth'):
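
The (None, None) pair returned for the 'random' voice is the sentinel that the updated tts() interprets as "sample a new voice"; a short sketch of the flow:

from tortoise.utils.audio import load_voice

voice_samples, conditioning_latents = load_voice('random')   # (None, None)
# Passing both values through tts()/tts_with_preset() triggers get_random_conditioning_latents().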