Update with downloadable model paths
- do_tts.py +35 -8
- requirements.txt +2 -1
do_tts.py
CHANGED
@@ -1,10 +1,13 @@
 import argparse
 import os
 import random
+from urllib import request

 import torch
 import torch.nn.functional as F
 import torchaudio
+from progressbar import progressbar
+
 from models.dvae import DiscreteVAE
 from models.autoregressive import UnifiedVoice
 from tqdm import tqdm
@@ -16,6 +19,32 @@ from utils.audio import load_audio
 from utils.diffusion import SpacedDiffusion, space_timesteps, get_named_beta_schedule
 from utils.tokenizer import VoiceBpeTokenizer

+pbar = None
+def download_models():
+    MODELS = {
+        'clip.pth': 'https://huggingface.co/jbetker/tortoise-tts-clip/resolve/main/pytorch-model.bin',
+        'dvae.pth': 'https://huggingface.co/jbetker/voice-dvae/resolve/main/pytorch_model.bin',
+        'diffusion.pth': 'https://huggingface.co/jbetker/tortoise-tts-diffusion-v1/resolve/main/pytorch-model.bin',
+        'autoregressive.pth': 'https://huggingface.co/jbetker/tortoise-tts-autoregressive/resolve/main/pytorch-model.bin'
+    }
+    def show_progress(block_num, block_size, total_size):
+        global pbar
+        if pbar is None:
+            pbar = progressbar.ProgressBar(maxval=total_size)
+            pbar.start()
+
+        downloaded = block_num * block_size
+        if downloaded < total_size:
+            pbar.update(downloaded)
+        else:
+            pbar.finish()
+            pbar = None
+    for model_name, url in MODELS.items():
+        if os.path.exists(f'.models/{model_name}'):
+            continue
+        print(f'Downloading {model_name} from {url}...')
+        request.urlretrieve(url, f'.models/{model_name}', show_progress)
+        print('Done.')

 def load_discrete_vocoder_diffuser(trained_diffusion_steps=4000, desired_diffusion_steps=200):
     """
@@ -103,10 +132,6 @@ if __name__ == '__main__':
     }

     parser = argparse.ArgumentParser()
-    parser.add_argument('-autoregressive_model_path', type=str, help='Autoregressive model checkpoint to load.', default='.models/unified_voice.pth')
-    parser.add_argument('-clip_model_path', type=str, help='CLIP model checkpoint to load.', default='.models/clip.pth')
-    parser.add_argument('-diffusion_model_path', type=str, help='Diffusion model checkpoint to load.', default='.models/diffusion_vocoder.pth')
-    parser.add_argument('-dvae_model_path', type=str, help='DVAE model checkpoint to load.', default='.models/dvae.pth')
     parser.add_argument('-text', type=str, help='Text to speak.', default="I am a language model that has learned to speak.")
     parser.add_argument('-voice', type=str, help='Use a preset conditioning voice (defined above). Overrides cond_path.', default='dotrice,harris,lescault,otto,atkins,grace,kennard,mol')
     parser.add_argument('-num_samples', type=int, help='How many total outputs the autoregressive transformer should produce.', default=512)
@@ -114,13 +139,15 @@ if __name__ == '__main__':
     parser.add_argument('-num_outputs', type=int, help='Number of outputs to produce.', default=2)
     parser.add_argument('-output_path', type=str, help='Where to store outputs.', default='results/')
     args = parser.parse_args()
+
     os.makedirs(args.output_path, exist_ok=True)
+    download_models()

     for voice in args.voice.split(','):
         print("Loading GPT TTS..")
         autoregressive = UnifiedVoice(max_mel_tokens=300, max_text_tokens=200, max_conditioning_inputs=2, layers=30, model_dim=1024,
                                       heads=16, number_text_tokens=256, start_text_token=255, checkpointing=False, train_solo_embeddings=False).cuda().eval()
-        autoregressive.load_state_dict(torch.load(args.autoregressive_model_path))
+        autoregressive.load_state_dict(torch.load('.models/autoregressive.pth'))
         stop_mel_token = autoregressive.stop_mel_token

         print("Loading data..")
@@ -148,7 +175,7 @@ if __name__ == '__main__':
         print("Loading CLIP..")
         clip = VoiceCLIP(dim_text=512, dim_speech=512, dim_latent=512, num_text_tokens=256, text_enc_depth=8, text_seq_len=120, text_heads=8,
                          num_speech_tokens=8192, speech_enc_depth=10, speech_heads=8, speech_seq_len=250).cuda().eval()
-        clip.load_state_dict(torch.load(args.clip_model_path))
+        clip.load_state_dict(torch.load('.models/clip.pth'))
         print("Performing CLIP filtering..")
         clip_results = []
         for batch in samples:
@@ -169,12 +196,12 @@ if __name__ == '__main__':
         print("Loading DVAE..")
         dvae = DiscreteVAE(positional_dims=1, channels=80, hidden_dim=512, num_resnet_blocks=3, codebook_dim=512, num_tokens=8192, num_layers=2,
                            record_codes=True, kernel_size=3, use_transposed_convs=False).cuda().eval()
-        dvae.load_state_dict(torch.load(args.dvae_model_path))
+        dvae.load_state_dict(torch.load('.models/dvae.pth'))
         print("Loading Diffusion Model..")
         diffusion = DiscreteDiffusionVocoder(model_channels=128, dvae_dim=80, channel_mult=[1, 1, 1.5, 2, 3, 4, 6, 8, 8, 8, 8], num_res_blocks=[1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1],
                                              spectrogram_conditioning_resolutions=[2,512], attention_resolutions=[512,1024], num_heads=4, kernel_size=3, scale_factor=2,
                                              conditioning_inputs_provided=True, time_embed_dim_multiplier=4).cuda().eval()
-        diffusion.load_state_dict(torch.load(args.diffusion_model_path))
+        diffusion.load_state_dict(torch.load('.models/diffusion.pth'))
         diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=100)

         print("Performing vocoding..")
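The new download_models() helper relies on the reporthook contract of urllib.request.urlretrieve: the hook is called with (block_num, block_size, total_size) after each chunk is fetched. A minimal self-contained sketch of the same pattern, written against tqdm (already imported by do_tts.py) purely for illustration; fetch_with_progress and its arguments are placeholders, not part of this commit:

# Illustrative sketch only: the urlretrieve reporthook pattern used by download_models(), shown with tqdm.
from urllib import request
from tqdm import tqdm

def fetch_with_progress(url, dest):
    bar = None
    def hook(block_num, block_size, total_size):
        nonlocal bar
        if bar is None and total_size > 0:
            bar = tqdm(total=total_size, unit='B', unit_scale=True)
        if bar is not None:
            # urlretrieve reports per-block progress; cap the update so the bar never overshoots.
            bar.update(min(block_size, total_size - bar.n))
    request.urlretrieve(url, dest, hook)
    if bar is not None:
        bar.close()

The hook closes over its progress bar the same way show_progress uses the module-level pbar; only the display library differs.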
requirements.txt
CHANGED
@@ -4,4 +4,5 @@ rotary_embedding_torch
 transformers
 tokenizers
 pyfastmp3decoder
-inflect
+inflect
+progressbar
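One assumption worth noting: download_models() writes into .models/ but does not create that directory, and urlretrieve will not create it either. A hypothetical pre-flight helper (not part of this commit) that prepares the directory and checks that the four checkpoints named in the diff are present might look like this:

# Hypothetical pre-flight helper: ensure .models/ exists and all expected checkpoints were fetched.
import os

MODEL_DIR = '.models'
EXPECTED = ['autoregressive.pth', 'clip.pth', 'diffusion.pth', 'dvae.pth']

os.makedirs(MODEL_DIR, exist_ok=True)  # run before download_models()

def verify_models():
    missing = [n for n in EXPECTED if not os.path.exists(os.path.join(MODEL_DIR, n))]
    if missing:
        raise FileNotFoundError(f"Missing checkpoints in {MODEL_DIR}/: {', '.join(missing)}")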