jbetker committed
Commit 8d03559
1 parent: 8655b05

Update with downloadable model paths

Files changed (2)
  1. do_tts.py +35 -8
  2. requirements.txt +2 -1
do_tts.py CHANGED
@@ -1,10 +1,13 @@
 import argparse
 import os
 import random
+from urllib import request
 
 import torch
 import torch.nn.functional as F
 import torchaudio
+import progressbar
+
 from models.dvae import DiscreteVAE
 from models.autoregressive import UnifiedVoice
 from tqdm import tqdm
@@ -16,6 +19,34 @@ from utils.audio import load_audio
 from utils.diffusion import SpacedDiffusion, space_timesteps, get_named_beta_schedule
 from utils.tokenizer import VoiceBpeTokenizer
 
+pbar = None
+def download_models():
+    # Fetch the pretrained model checkpoints from the HuggingFace Hub into .models/.
+    MODELS = {
+        'clip.pth': 'https://huggingface.co/jbetker/tortoise-tts-clip/resolve/main/pytorch-model.bin',
+        'dvae.pth': 'https://huggingface.co/jbetker/voice-dvae/resolve/main/pytorch_model.bin',
+        'diffusion.pth': 'https://huggingface.co/jbetker/tortoise-tts-diffusion-v1/resolve/main/pytorch-model.bin',
+        'autoregressive.pth': 'https://huggingface.co/jbetker/tortoise-tts-autoregressive/resolve/main/pytorch-model.bin'
+    }
+    os.makedirs('.models', exist_ok=True)
+    def show_progress(block_num, block_size, total_size):
+        global pbar
+        if pbar is None:
+            pbar = progressbar.ProgressBar(maxval=total_size)
+            pbar.start()
+
+        downloaded = block_num * block_size
+        if downloaded < total_size:
+            pbar.update(downloaded)
+        else:
+            pbar.finish()
+            pbar = None
+    for model_name, url in MODELS.items():
+        if os.path.exists(f'.models/{model_name}'):
+            continue
+        print(f'Downloading {model_name} from {url}...')
+        request.urlretrieve(url, f'.models/{model_name}', show_progress)
+        print('Done.')
 
 def load_discrete_vocoder_diffuser(trained_diffusion_steps=4000, desired_diffusion_steps=200):
     """
@@ -103,10 +134,6 @@ if __name__ == '__main__':
     }
 
     parser = argparse.ArgumentParser()
-    parser.add_argument('-autoregressive_model_path', type=str, help='Autoregressive model checkpoint to load.', default='.models/unified_voice.pth')
-    parser.add_argument('-clip_model_path', type=str, help='CLIP model checkpoint to load.', default='.models/clip.pth')
-    parser.add_argument('-diffusion_model_path', type=str, help='Diffusion model checkpoint to load.', default='.models/diffusion_vocoder.pth')
-    parser.add_argument('-dvae_model_path', type=str, help='DVAE model checkpoint to load.', default='.models/dvae.pth')
     parser.add_argument('-text', type=str, help='Text to speak.', default="I am a language model that has learned to speak.")
     parser.add_argument('-voice', type=str, help='Use a preset conditioning voice (defined above). Overrides cond_path.', default='dotrice,harris,lescault,otto,atkins,grace,kennard,mol')
     parser.add_argument('-num_samples', type=int, help='How many total outputs the autoregressive transformer should produce.', default=512)
@@ -114,13 +141,15 @@ if __name__ == '__main__':
     parser.add_argument('-num_outputs', type=int, help='Number of outputs to produce.', default=2)
     parser.add_argument('-output_path', type=str, help='Where to store outputs.', default='results/')
     args = parser.parse_args()
+
     os.makedirs(args.output_path, exist_ok=True)
+    download_models()
 
     for voice in args.voice.split(','):
         print("Loading GPT TTS..")
         autoregressive = UnifiedVoice(max_mel_tokens=300, max_text_tokens=200, max_conditioning_inputs=2, layers=30, model_dim=1024,
                                       heads=16, number_text_tokens=256, start_text_token=255, checkpointing=False, train_solo_embeddings=False).cuda().eval()
-        autoregressive.load_state_dict(torch.load(args.autoregressive_model_path))
+        autoregressive.load_state_dict(torch.load('.models/autoregressive.pth'))
         stop_mel_token = autoregressive.stop_mel_token
 
         print("Loading data..")
@@ -148,7 +177,7 @@ if __name__ == '__main__':
         print("Loading CLIP..")
         clip = VoiceCLIP(dim_text=512, dim_speech=512, dim_latent=512, num_text_tokens=256, text_enc_depth=8, text_seq_len=120, text_heads=8,
                          num_speech_tokens=8192, speech_enc_depth=10, speech_heads=8, speech_seq_len=250).cuda().eval()
-        clip.load_state_dict(torch.load(args.clip_model_path))
+        clip.load_state_dict(torch.load('.models/clip.pth'))
         print("Performing CLIP filtering..")
         clip_results = []
         for batch in samples:
@@ -169,12 +198,12 @@ if __name__ == '__main__':
         print("Loading DVAE..")
         dvae = DiscreteVAE(positional_dims=1, channels=80, hidden_dim=512, num_resnet_blocks=3, codebook_dim=512, num_tokens=8192, num_layers=2,
                            record_codes=True, kernel_size=3, use_transposed_convs=False).cuda().eval()
-        dvae.load_state_dict(torch.load(args.dvae_model_path))
+        dvae.load_state_dict(torch.load('.models/dvae.pth'))
         print("Loading Diffusion Model..")
         diffusion = DiscreteDiffusionVocoder(model_channels=128, dvae_dim=80, channel_mult=[1, 1, 1.5, 2, 3, 4, 6, 8, 8, 8, 8], num_res_blocks=[1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1],
                                              spectrogram_conditioning_resolutions=[2,512], attention_resolutions=[512,1024], num_heads=4, kernel_size=3, scale_factor=2,
                                              conditioning_inputs_provided=True, time_embed_dim_multiplier=4).cuda().eval()
-        diffusion.load_state_dict(torch.load(args.diffusion_model_path))
+        diffusion.load_state_dict(torch.load('.models/diffusion.pth'))
         diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=100)
 
         print("Performing vocoding..")
requirements.txt CHANGED
@@ -4,4 +4,5 @@ rotary_embedding_torch
 transformers
 tokenizers
 pyfastmp3decoder
-inflect
+inflect
+progressbar
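
progressbar is the new runtime dependency backing the download progress display (progressbar.ProgressBar in do_tts.py); a fresh environment should pick it up with the usual pip install -r requirements.txt before the first run.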