update do_tts
api.py
CHANGED
@@ -157,10 +157,23 @@ class TextToSpeech:
 
         self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
                                            model_dim=1024,
-                                           heads=16, number_text_tokens=
+                                           heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
                                            train_solo_embeddings=False,
                                            average_conditioning_embeddings=True).cpu().eval()
         self.autoregressive.load_state_dict(torch.load('.models/autoregressive.pth'))
+        '''
+        self.autoregressive = UnifiedVoice(max_mel_tokens=2048, max_text_tokens=1024, max_conditioning_inputs=1, layers=42,
+                                           model_dim=1152, heads=18, number_text_tokens=256, train_solo_embeddings=False,
+                                           average_conditioning_embeddings=True, types=2).cpu().eval()
+        self.autoregressive.load_state_dict(torch.load('X:\\dlas\\experiments\\train_gpt_tts_xl\\models\\15250_gpt_ema.pth'))
+        '''
+
+        self.autoregressive_for_diffusion = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
+                                                         model_dim=1024,
+                                                         heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
+                                                         train_solo_embeddings=False,
+                                                         average_conditioning_embeddings=True).cpu().eval()
+        self.autoregressive_for_diffusion.load_state_dict(torch.load('.models/autoregressive.pth'))
 
         self.clip = VoiceCLIP(dim_text=512, dim_speech=512, dim_latent=512, num_text_tokens=256, text_enc_depth=12,
                               text_seq_len=350, text_heads=8,

@@ -202,7 +215,7 @@ class TextToSpeech:
 
     def tts(self, text, voice_samples, k=1,
             # autoregressive generation parameters follow
-            num_autoregressive_samples=512, temperature=.8, length_penalty=1, repetition_penalty=2.0, top_p=.8,
+            num_autoregressive_samples=512, temperature=.8, length_penalty=1, repetition_penalty=2.0, top_p=.8, max_mel_tokens=500,
             # diffusion generation parameters follow
             diffusion_iterations=100, cond_free=True, cond_free_k=2, diffusion_temperature=1.0,
             **hf_generate_kwargs):

@@ -232,8 +245,9 @@ class TextToSpeech:
                                                 num_return_sequences=self.autoregressive_batch_size,
                                                 length_penalty=length_penalty,
                                                 repetition_penalty=repetition_penalty,
+                                                max_generate_length=max_mel_tokens,
                                                 **hf_generate_kwargs)
-            padding_needed =
+            padding_needed = max_mel_tokens - codes.shape[1]
             codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
             samples.append(codes)
         self.autoregressive = self.autoregressive.cpu()

@@ -253,11 +267,11 @@ class TextToSpeech:
         # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
         # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
         # results, but will increase memory usage.
-        self.autoregressive = self.autoregressive.cuda()
-        best_latents = self.autoregressive(conds, text, torch.tensor([text.shape[-1]], device=conds.device), best_results,
-                                           torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=conds.device),
+        self.autoregressive_for_diffusion = self.autoregressive_for_diffusion.cuda()
+        best_latents = self.autoregressive_for_diffusion(conds, text, torch.tensor([text.shape[-1]], device=conds.device), best_results,
+                                                         torch.tensor([best_results.shape[-1]*self.autoregressive_for_diffusion.mel_length_compression], device=conds.device),
                                                          return_latent=True, clip_inputs=False)
-        self.autoregressive = self.autoregressive.cpu()
+        self.autoregressive_for_diffusion = self.autoregressive_for_diffusion.cpu()
 
         print("Performing vocoding..")
         wav_candidates = []
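Taken together, the api.py changes thread a single max_mel_tokens knob through sampling: it caps generation length via max_generate_length and fixes the padded width of every returned code batch, so codes from separate generate() calls can be stacked. The sketch below illustrates just that padding step; the tensor shapes and the stop_mel_token id are made-up values for demonstration, not taken from the repository.

    import torch
    import torch.nn.functional as F

    max_mel_tokens = 500   # the new tts() keyword introduced by this commit
    stop_mel_token = 8192  # illustrative id; the real one comes from the model

    # Pretend two generate() calls produced code batches of different lengths.
    batches = [torch.randint(0, 8192, (4, 337)), torch.randint(0, 8192, (4, 215))]

    samples = []
    for codes in batches:
        # Right-pad each batch to max_mel_tokens with the stop token, mirroring
        # `codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)` above.
        padding_needed = max_mel_tokens - codes.shape[1]
        samples.append(F.pad(codes, (0, padding_needed), value=stop_mel_token))

    stacked = torch.cat(samples, dim=0)  # shape (8, 500): uniform width makes stacking legal

The new autoregressive_for_diffusion attribute loads the same .models/autoregressive.pth weights as self.autoregressive; per the comment in the last hunk, it is moved onto the GPU only to recompute the last-hidden-layer latents that condition the diffusion model, then returned to the CPU.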
do_tts.py
CHANGED
@@ -1,35 +1,17 @@
 import argparse
 import os
 
-import torch
-import torch.nn.functional as F
 import torchaudio
 
-from api import TextToSpeech
-from utils.audio import load_audio
-from utils.tokenizer import VoiceBpeTokenizer
+from api import TextToSpeech
+from utils.audio import load_audio, get_voices
 
 if __name__ == '__main__':
-    # These are voices drawn randomly from the training set. You are free to substitute your own voices in, but testing
-    # has shown that the model does not generalize to new voices very well.
-    preselected_cond_voices = {
-        # Male voices
-        'dotrice': ['voices/dotrice/1.wav', 'voices/dotrice/2.wav'],
-        'harris': ['voices/harris/1.wav', 'voices/harris/2.wav'],
-        'lescault': ['voices/lescault/1.wav', 'voices/lescault/2.wav'],
-        'otto': ['voices/otto/1.wav', 'voices/otto/2.wav'],
-        'obama': ['voices/obama/1.wav', 'voices/obama/2.wav'],
-        # Female voices
-        'atkins': ['voices/atkins/1.wav', 'voices/atkins/2.wav'],
-        'grace': ['voices/grace/1.wav', 'voices/grace/2.wav'],
-        'kennard': ['voices/kennard/1.wav', 'voices/kennard/2.wav'],
-        'mol': ['voices/mol/1.wav', 'voices/mol/2.wav'],
-    }
-
     parser = argparse.ArgumentParser()
     parser.add_argument('--text', type=str, help='Text to speak.', default="I am a language model that has learned to speak.")
-    parser.add_argument('--voice', type=str, help='
-
+    parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
+                                                  'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='patrick_stewart')
+    parser.add_argument('--num_samples', type=int, help='How many total outputs the autoregressive transformer should produce.', default=256)
     parser.add_argument('--batch_size', type=int, help='How many samples to process at once in the autoregressive model.', default=16)
     parser.add_argument('--num_diffusion_samples', type=int, help='Number of outputs that progress to the diffusion stage.', default=16)
     parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/')

@@ -38,8 +20,10 @@ if __name__ == '__main__':
 
     tts = TextToSpeech(autoregressive_batch_size=args.batch_size)
 
-    for voice in args.voice.split(','):
-        cond_paths = preselected_cond_voices[voice]
+    voices = get_voices()
+    selected_voices = args.voice.split(',')
+    for voice in selected_voices:
+        cond_paths = voices[voice]
         conds = []
         for cond_path in cond_paths:
             c = load_audio(cond_path, 22050)