indic / TTS /vocoder /models /wavernn.py
azamat's picture
Init
6127b48
raw
history blame
24.8 kB
import sys
import time
from dataclasses import dataclass, field
from typing import Dict, List, Tuple
import numpy as np
import torch
import torch.nn.functional as F
from coqpit import Coqpit
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from TTS.tts.utils.visual import plot_spectrogram
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_fsspec
from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset
from TTS.vocoder.layers.losses import WaveRNNLoss
from TTS.vocoder.models.base_vocoder import BaseVocoder
from TTS.vocoder.utils.distribution import sample_from_discretized_mix_logistic, sample_from_gaussian
def stream(string, variables):
    """Redraw the current console line with a %-formatted message.

    Args:
        string (str): printf-style template.
        variables: Value (or tuple of values) interpolated into the template with ``%``.
    """
    # The leading carriage return moves the cursor back to column 0, so
    # successive calls overwrite each other instead of scrolling.
    template = f"\r{string}"
    message = template % variables
    sys.stdout.write(message)
# pylint: disable=abstract-method
# relates https://github.com/pytorch/pytorch/issues/42305
class ResBlock(nn.Module):
    """Residual block made of two pointwise (kernel size 1) Conv1d + BatchNorm1d stages.

    The channel count is preserved, so the block input can be added to its output.

    Args:
        dims (int): Number of input and output channels.
    """

    def __init__(self, dims):
        super().__init__()
        pointwise = {"kernel_size": 1, "bias": False}
        # Attribute names and creation order are kept as-is: they define the
        # state_dict keys and the order in which random init is consumed.
        self.conv1 = nn.Conv1d(dims, dims, **pointwise)
        self.conv2 = nn.Conv1d(dims, dims, **pointwise)
        self.batch_norm1 = nn.BatchNorm1d(dims)
        self.batch_norm2 = nn.BatchNorm1d(dims)

    def forward(self, x):
        """Return ``bn2(conv2(relu(bn1(conv1(x))))) + x``."""
        hidden = F.relu(self.batch_norm1(self.conv1(x)))
        hidden = self.batch_norm2(self.conv2(hidden))
        return hidden + x
class MelResNet(nn.Module):
    """Stack of residual blocks applied to the conditioning features.

    Args:
        num_res_blocks (int): Number of ``ResBlock`` layers.
        in_dims (int): Channels of the input features.
        compute_dims (int): Channels used inside the residual stack.
        res_out_dims (int): Channels of the output.
        pad (int): Context frames on each side; the input convolution uses a
            kernel of ``2 * pad + 1`` without padding, so the output is
            ``2 * pad`` frames shorter than the input.
    """

    def __init__(self, num_res_blocks, in_dims, compute_dims, res_out_dims, pad):
        super().__init__()
        self.conv_in = nn.Conv1d(in_dims, compute_dims, kernel_size=2 * pad + 1, bias=False)
        self.batch_norm = nn.BatchNorm1d(compute_dims)
        self.layers = nn.ModuleList([ResBlock(compute_dims) for _ in range(num_res_blocks)])
        self.conv_out = nn.Conv1d(compute_dims, res_out_dims, kernel_size=1)

    def forward(self, x):
        """Run input conv -> batch norm -> ReLU -> residual blocks -> output conv."""
        x = F.relu(self.batch_norm(self.conv_in(x)))
        for block in self.layers:
            x = block(x)
        return self.conv_out(x)
class Stretch2d(nn.Module):
    """Nearest-neighbour upsampling of a 4D tensor by integer factors.

    Args:
        x_scale (int): Repetition factor along the last (width) axis.
        y_scale (int): Repetition factor along the height axis.
    """

    def __init__(self, x_scale, y_scale):
        super().__init__()
        self.x_scale = x_scale
        self.y_scale = y_scale

    def forward(self, x):
        """Repeat every element ``y_scale`` times vertically and ``x_scale`` times horizontally."""
        batch, channels, height, width = x.size()
        # Add a singleton axis after height and after width, tile along those
        # axes, then merge each of them back into its neighbour.
        tiled = x[:, :, :, None, :, None].repeat(1, 1, 1, self.y_scale, 1, self.x_scale)
        return tiled.view(batch, channels, height * self.y_scale, width * self.x_scale)
class UpsampleNetwork(nn.Module):
    """Learned upsampling of the conditioning features to the sample rate.

    Each factor in ``upsample_scales`` is applied as a ``Stretch2d`` followed by
    a smoothing ``Conv2d`` initialised as a moving average. Optionally a
    ``MelResNet`` produces auxiliary features that are stretched by the total
    factor.

    Args:
        feat_dims (int): Channels of the input features.
        upsample_scales (list of int): Per-stage upsampling factors.
        compute_dims (int): Hidden channels of the auxiliary ResNet.
        num_res_blocks (int): Residual blocks in the auxiliary ResNet.
        res_out_dims (int): Output channels of the auxiliary ResNet.
        pad (int): Context frames on each side of the input.
        use_aux_net (bool): Whether to build and run the auxiliary ResNet.
    """

    def __init__(
        self,
        feat_dims,
        upsample_scales,
        compute_dims,
        num_res_blocks,
        res_out_dims,
        pad,
        use_aux_net,
    ):
        super().__init__()
        # np.cumproduct was removed in NumPy 2.0; np.prod yields the same total.
        # Cast to a plain int so slicing and tensor shape arguments never see a NumPy scalar.
        self.total_scale = int(np.prod(upsample_scales))
        self.indent = pad * self.total_scale
        self.use_aux_net = use_aux_net
        if use_aux_net:
            self.resnet = MelResNet(num_res_blocks, feat_dims, compute_dims, res_out_dims, pad)
            self.resnet_stretch = Stretch2d(self.total_scale, 1)
        self.up_layers = nn.ModuleList()
        for scale in upsample_scales:
            k_size = (1, scale * 2 + 1)
            padding = (0, scale)
            stretch = Stretch2d(scale, 1)
            conv = nn.Conv2d(1, 1, kernel_size=k_size, padding=padding, bias=False)
            # Start as a uniform moving average over the stretched frames.
            conv.weight.data.fill_(1.0 / k_size[1])
            self.up_layers.append(stretch)
            self.up_layers.append(conv)

    def forward(self, m):
        """Upsample ``m`` of shape (batch, feat_dims, frames).

        Returns:
            tuple: ``(features, aux)``; both have time on axis 1 after the final
            transpose. ``aux`` is ``None`` when the auxiliary network is disabled.
        """
        if self.use_aux_net:
            aux = self.resnet(m).unsqueeze(1)
            aux = self.resnet_stretch(aux)
            aux = aux.squeeze(1)
            aux = aux.transpose(1, 2)
        else:
            aux = None
        m = m.unsqueeze(1)
        for f in self.up_layers:
            m = f(m)
        m = m.squeeze(1)
        # Trim the upsampled context frames. An explicit end index is used
        # because ``[indent:-indent]`` returns an empty tensor when indent == 0.
        m = m[:, :, self.indent : m.size(-1) - self.indent]
        return m.transpose(1, 2), aux
class Upsample(nn.Module):
    """Parameter-free linear-interpolation upsampling of the conditioning features.

    Args:
        scale (int): Upsampling factor applied along the time axis.
        pad (int): Context frames on each side of the input.
        num_res_blocks (int): Residual blocks in the auxiliary ResNet.
        feat_dims (int): Channels of the input features.
        compute_dims (int): Hidden channels of the auxiliary ResNet.
        res_out_dims (int): Output channels of the auxiliary ResNet.
        use_aux_net (bool): Whether ``forward`` runs the auxiliary ResNet.
    """

    def __init__(self, scale, pad, num_res_blocks, feat_dims, compute_dims, res_out_dims, use_aux_net):
        super().__init__()
        self.scale = scale
        self.pad = pad
        self.indent = pad * scale
        self.use_aux_net = use_aux_net
        # The ResNet is always constructed (even when unused), so its
        # parameters are always part of the state_dict.
        self.resnet = MelResNet(num_res_blocks, feat_dims, compute_dims, res_out_dims, pad)

    def forward(self, m):
        """Upsample ``m`` of shape (batch, feat_dims, frames).

        Returns:
            tuple: ``(features, aux)`` with time on axis 1; ``aux`` is ``None``
            when the auxiliary network is disabled.
        """
        if self.use_aux_net:
            aux = self.resnet(m)
            aux = torch.nn.functional.interpolate(aux, scale_factor=self.scale, mode="linear", align_corners=True)
            aux = aux.transpose(1, 2)
        else:
            aux = None
        m = torch.nn.functional.interpolate(m, scale_factor=self.scale, mode="linear", align_corners=True)
        # Trim the upsampled context frames. An explicit end index is used
        # because ``[indent:-indent]`` returns an empty tensor when indent == 0.
        m = m[:, :, self.indent : m.size(-1) - self.indent]
        m = m * 0.045  # empirically found
        return m.transpose(1, 2), aux
@dataclass
class WavernnArgs(Coqpit):
    """🐸 WaveRNN model arguments.

    rnn_dims (int):
        Number of hidden channels in RNN layers. Defaults to 512.
    fc_dims (int):
        Number of hidden channels in fully-connected layers. Defaults to 512.
    compute_dims (int):
        Number of hidden channels in the feature ResNet. Defaults to 128.
    res_out_dims (int):
        Number of channels in the feature ResNet output. Defaults to 128.
    num_res_blocks (int):
        Number of residual blocks in the ResNet. Defaults to 10.
    use_aux_net (bool):
        Enable / disable the feature ResNet. Defaults to True.
    use_upsample_net (bool):
        Enable / disable the upsampling network. If False, basic upsampling is used. Defaults to True.
    upsample_factors (list):
        Upsampling factors. The product of the values must match the `hop_length`. Defaults to ```[4, 8, 8]```.
    mode (str):
        Output mode of the WaveRNN vocoder. `mold` for Mixture of Logistic Distribution, `gauss` for a single
        Gaussian Distribution, or an integer number of bits for quantized output. Defaults to `mold`.
    mulaw (bool):
        Enable / disable the use of Mulaw quantization for training. Only applicable if `mode` is a number of
        bits. Defaults to `True`.
    pad (int):
        Padding applied to the input feature frames against the convolution layers of the feature network.
        Defaults to 2.
    feat_dims (int):
        Number of channels of the input feature frames. Defaults to 80.
    """

    rnn_dims: int = 512
    fc_dims: int = 512
    compute_dims: int = 128
    res_out_dims: int = 128
    num_res_blocks: int = 10
    use_aux_net: bool = True
    use_upsample_net: bool = True
    upsample_factors: List[int] = field(default_factory=lambda: [4, 8, 8])
    mode: str = "mold"  # mold [string], gauss [string], bits [int]
    mulaw: bool = True  # apply mulaw if mode is bits
    pad: int = 2
    feat_dims: int = 80
class Wavernn(BaseVocoder):
def __init__(self, config: Coqpit):
    """🐸 WaveRNN model.
    Original paper - https://arxiv.org/abs/1802.08435
    Official implementation - https://github.com/fatchord/WaveRNN

    Args:
        config (Coqpit): Model configuration. ``config.audio.hop_length`` is read here; the model arguments are
            read from ``self.args`` after the base-class constructor has run.

    Raises:
        RuntimeError: If ``args.mode`` is neither an int, ``"mold"`` nor ``"gauss"``.
        AssertionError: If the learned upsampling network is enabled and the product of
            ``args.upsample_factors`` differs from ``config.audio.hop_length``.

    Examples:
        >>> from TTS.vocoder.configs import WavernnConfig
        >>> config = WavernnConfig()
        >>> model = Wavernn(config)

    Paper Abstract:
        Sequential models achieve state-of-the-art results in audio, visual and textual domains with respect to
        both estimating the data distribution and generating high-quality samples. Efficient sampling for this
        class of models has however remained an elusive problem. With a focus on text-to-speech synthesis, we
        describe a set of general techniques for reducing sampling time while maintaining high output quality.
        We first describe a single-layer recurrent neural network, the WaveRNN, with a dual softmax layer that
        matches the quality of the state-of-the-art WaveNet model. The compact form of the network makes it
        possible to generate 24kHz 16-bit audio 4x faster than real time on a GPU. Second, we apply a weight
        pruning technique to reduce the number of weights in the WaveRNN. We find that, for a constant number of
        parameters, large sparse networks perform better than small dense networks and this relationship holds for
        sparsity levels beyond 96%. The small number of weights in a Sparse WaveRNN makes it possible to sample
        high-fidelity audio on a mobile CPU in real time. Finally, we propose a new generation scheme based on
        subscaling that folds a long sequence into a batch of shorter sequences and allows one to generate multiple
        samples at once. The Subscale WaveRNN produces 16 samples per step without loss of quality and offers an
        orthogonal method for increasing sampling efficiency.
    """
    super().__init__(config)

    # Size of the output layer depends on the output distribution.
    if isinstance(self.args.mode, int):
        self.n_classes = 2**self.args.mode
    elif self.args.mode == "mold":
        self.n_classes = 3 * 10
    elif self.args.mode == "gauss":
        self.n_classes = 2
    else:
        raise RuntimeError("Unknown model mode value - ", self.args.mode)

    # The auxiliary ResNet output is split into four equal slices, one per injection point.
    self.aux_dims = self.args.res_out_dims // 4

    if self.args.use_upsample_net:
        # np.cumproduct was removed in NumPy 2.0; np.prod gives the same total factor.
        assert (
            int(np.prod(self.args.upsample_factors)) == config.audio.hop_length
        ), " [!] upsample scales needs to be equal to hop_length"
        self.upsample = UpsampleNetwork(
            self.args.feat_dims,
            self.args.upsample_factors,
            self.args.compute_dims,
            self.args.num_res_blocks,
            self.args.res_out_dims,
            self.args.pad,
            self.args.use_aux_net,
        )
    else:
        self.upsample = Upsample(
            config.audio.hop_length,
            self.args.pad,
            self.args.num_res_blocks,
            self.args.feat_dims,
            self.args.compute_dims,
            self.args.res_out_dims,
            self.args.use_aux_net,
        )

    # Extra input width contributed by an auxiliary slice (zero when the aux net is disabled).
    # Layers are created in the same order in both configurations.
    aux_width = self.aux_dims if self.args.use_aux_net else 0
    rnn_dims = self.args.rnn_dims
    fc_dims = self.args.fc_dims
    self.I = nn.Linear(self.args.feat_dims + aux_width + 1, rnn_dims)
    self.rnn1 = nn.GRU(rnn_dims, rnn_dims, batch_first=True)
    self.rnn2 = nn.GRU(rnn_dims + aux_width, rnn_dims, batch_first=True)
    self.fc1 = nn.Linear(rnn_dims + aux_width, fc_dims)
    self.fc2 = nn.Linear(fc_dims + aux_width, fc_dims)
    self.fc3 = nn.Linear(fc_dims, self.n_classes)
def forward(self, x, mels):
    """Teacher-forced pass over a whole sequence.

    Args:
        x (Tensor): Input waveform samples, shape (batch, time).
        mels (Tensor): Conditioning features handed to ``self.upsample``.

    Returns:
        Tensor: Output of the final linear layer, shape (batch, time, n_classes).
    """
    use_aux = self.args.use_aux_net
    batch_size = x.size(0)
    h1 = torch.zeros(1, batch_size, self.args.rnn_dims, device=x.device)
    h2 = torch.zeros(1, batch_size, self.args.rnn_dims, device=x.device)
    mels, aux = self.upsample(mels)

    if use_aux:
        # Four consecutive slices of width aux_dims, one per injection point.
        width = self.aux_dims
        a1, a2, a3, a4 = (aux[:, :, k * width : (k + 1) * width] for k in range(4))
        x = torch.cat([x.unsqueeze(-1), mels, a1], dim=2)
    else:
        x = torch.cat([x.unsqueeze(-1), mels], dim=2)

    x = self.I(x)

    # First GRU with a residual connection.
    self.rnn1.flatten_parameters()
    rnn1_out, _ = self.rnn1(x, h1)
    x = rnn1_out + x

    # Second GRU; the aux slice only feeds the GRU input, not the residual.
    rnn2_in = torch.cat([x, a2], dim=2) if use_aux else x
    self.rnn2.flatten_parameters()
    rnn2_out, _ = self.rnn2(rnn2_in, h2)
    x = rnn2_out + x

    fc1_in = torch.cat([x, a3], dim=2) if use_aux else x
    x = F.relu(self.fc1(fc1_in))

    fc2_in = torch.cat([x, a4], dim=2) if use_aux else x
    x = F.relu(self.fc2(fc2_in))
    return self.fc3(x)
def inference(self, mels, batched=None, target=None, overlap=None):
    """Autoregressively generate a waveform from conditioning features.

    The model is switched to eval mode for the duration of the call and its previous train/eval mode is
    restored afterwards, also when generation raises.

    Args:
        mels (np.ndarray or Tensor): Conditioning features with frames on the last axis. A 2D input gets a
            leading batch axis.
        batched (bool, optional): If truthy, fold the upsampled features into overlapping segments that are
            generated in parallel and cross-faded back together.
        target (int, optional): Segment length passed to ``fold_with_overlap`` / ``xfade_and_unfold``.
            Required when ``batched`` is truthy.
        overlap (int, optional): Segment overlap passed to ``fold_with_overlap`` / ``xfade_and_unfold``.
            Required when ``batched`` is truthy.

    Returns:
        Generated samples truncated to ``(frames - 1) * hop_length``. In batched mode this is the float64
        array produced by ``xfade_and_unfold``; otherwise it is the first row of the stacked samples.
        NOTE(review): the non-batched result stays a torch tensor unless ``mulaw_decode`` converts it -
        confirm that callers and the in-place fade-out below handle both types.

    Raises:
        RuntimeError: If ``args.mode`` is neither an int, ``"mold"`` nor ``"gauss"``.
    """
    # Remember the caller's mode: unconditionally calling self.train() afterwards would silently
    # put a model that was in eval mode into training mode.
    was_training = self.training
    self.eval()
    try:
        output = []
        start = time.time()
        # Single-step cells sharing the trained GRU weights, for sample-by-sample generation.
        rnn1 = self.get_gru_cell(self.rnn1)
        rnn2 = self.get_gru_cell(self.rnn2)

        with torch.no_grad():
            if isinstance(mels, np.ndarray):
                mels = torch.FloatTensor(mels).to(str(next(self.parameters()).device))

            if mels.ndim == 2:
                mels = mels.unsqueeze(0)
            # Computed before padding, so the context frames do not count.
            wave_len = (mels.size(-1) - 1) * self.config.audio.hop_length

            mels = self.pad_tensor(mels.transpose(1, 2), pad=self.args.pad, side="both")
            mels, aux = self.upsample(mels.transpose(1, 2))

            if batched:
                mels = self.fold_with_overlap(mels, target, overlap)
                if aux is not None:
                    aux = self.fold_with_overlap(aux, target, overlap)

            b_size, seq_len, _ = mels.size()

            h1 = torch.zeros(b_size, self.args.rnn_dims).type_as(mels)
            h2 = torch.zeros(b_size, self.args.rnn_dims).type_as(mels)
            x = torch.zeros(b_size, 1).type_as(mels)

            if self.args.use_aux_net:
                d = self.aux_dims
                aux_split = [aux[:, :, d * i : d * (i + 1)] for i in range(4)]

            for i in range(seq_len):
                m_t = mels[:, i, :]

                if self.args.use_aux_net:
                    a1_t, a2_t, a3_t, a4_t = (a[:, i, :] for a in aux_split)

                x = torch.cat([x, m_t, a1_t], dim=1) if self.args.use_aux_net else torch.cat([x, m_t], dim=1)
                x = self.I(x)
                h1 = rnn1(x, h1)

                x = x + h1
                inp = torch.cat([x, a2_t], dim=1) if self.args.use_aux_net else x
                h2 = rnn2(inp, h2)

                x = x + h2
                x = torch.cat([x, a3_t], dim=1) if self.args.use_aux_net else x
                x = F.relu(self.fc1(x))

                x = torch.cat([x, a4_t], dim=1) if self.args.use_aux_net else x
                x = F.relu(self.fc2(x))

                logits = self.fc3(x)

                # Draw the next sample and feed it back as the next input.
                if self.args.mode == "mold":
                    sample = sample_from_discretized_mix_logistic(logits.unsqueeze(0).transpose(1, 2))
                    output.append(sample.view(-1))
                    x = sample.transpose(0, 1).type_as(mels)
                elif self.args.mode == "gauss":
                    sample = sample_from_gaussian(logits.unsqueeze(0).transpose(1, 2))
                    output.append(sample.view(-1))
                    x = sample.transpose(0, 1).type_as(mels)
                elif isinstance(self.args.mode, int):
                    posterior = F.softmax(logits, dim=1)
                    distrib = torch.distributions.Categorical(posterior)

                    # Map the class index from [0, n_classes - 1] to [-1, 1].
                    sample = 2 * distrib.sample().float() / (self.n_classes - 1.0) - 1.0
                    output.append(sample)
                    x = sample.unsqueeze(-1)
                else:
                    raise RuntimeError("Unknown model mode value - ", self.args.mode)

                if i % 100 == 0:
                    self.gen_display(i, seq_len, b_size, start)

        output = torch.stack(output).transpose(0, 1)
        output = output.cpu()
        if batched:
            output = output.numpy()
            output = output.astype(np.float64)

            output = self.xfade_and_unfold(output, target, overlap)
        else:
            output = output[0]

        if self.args.mulaw and isinstance(self.args.mode, int):
            output = AudioProcessor.mulaw_decode(output, self.args.mode)

        # Fade-out at the end to avoid signal cutting out suddenly
        fade_out = np.linspace(1, 0, 20 * self.config.audio.hop_length)
        output = output[:wave_len]

        if wave_len > len(fade_out):
            output[-20 * self.config.audio.hop_length :] *= fade_out
    finally:
        self.train(was_training)
    return output
def gen_display(self, i, seq_len, b_size, start):
    """Print a single-line generation progress report.

    Args:
        i (int): Index of the step that just finished.
        seq_len (int): Total number of steps.
        b_size (int): Number of sequences generated in parallel.
        start (float): ``time.time()`` value taken when generation started.
    """
    elapsed = time.time() - start
    # Thousands of samples generated per second across the whole batch.
    gen_rate = (i + 1) / elapsed * b_size / 1000
    realtime_ratio = gen_rate * 1000 / self.config.audio.sample_rate
    progress = (i * b_size, seq_len * b_size, b_size, gen_rate, realtime_ratio)
    stream("%i/%i -- batch_size: %i -- gen_rate: %.1f kHz -- x_realtime: %.1f ", progress)
def fold_with_overlap(self, x, target, overlap):
    """Cut a long sequence into overlapping segments for batched inference.

    The shared regions are later cross-faded by xfade_and_unfold().

    Args:
        x (tensor)    : Upsampled conditioning features.
                        shape=(1, timesteps, features)
        target (int)  : Target timesteps for each index of batch
        overlap (int) : Timesteps for both xfade and rnn warmup

    Return:
        (tensor) : shape=(num_folds, target + 2 * overlap, features)

    Details:
        x = [[h1, h2, ... hn]]
        Where each h is a vector of conditioning features
        Eg: target=2, overlap=1 with x.size(1)=10
        folded = [[h1, h2, h3, h4],
                  [h4, h5, h6, h7],
                  [h7, h8, h9, h10]]
    """
    _, total_len, features = x.size()
    hop = target + overlap  # distance between the starts of consecutive segments
    window = target + 2 * overlap  # length of every segment

    # Number of complete segments and the timesteps left over after them.
    num_folds, leftover = divmod(total_len - overlap, hop)

    # Leftover timesteps get one more segment, zero-padded at the end.
    if leftover != 0:
        num_folds += 1
        x = F.pad(x, (0, 0, 0, window - leftover))

    folded = torch.zeros(num_folds, window, features).to(x.device)
    for fold in range(num_folds):
        begin = fold * hop
        folded[fold] = x[:, begin : begin + window, :]
    return folded
@staticmethod
def get_gru_cell(gru):
    """Create an ``nn.GRUCell`` that uses the layer-0 weights and biases of ``gru``.

    Args:
        gru (nn.GRU): Source recurrent layer.

    Returns:
        nn.GRUCell: Cell with matching input/hidden sizes whose parameter data is taken from ``gru``.
    """
    cell = nn.GRUCell(gru.input_size, gru.hidden_size)
    for name in ("weight_hh", "weight_ih", "bias_hh", "bias_ih"):
        # Re-point the cell parameter at the GRU's layer-0 tensor of the same kind.
        getattr(cell, name).data = getattr(gru, f"{name}_l0").data
    return cell
@staticmethod
def pad_tensor(x, pad, side="both"):
    """Zero-pad a (batch, time, channels) tensor along the time axis.

    NB - a quick helper for exactly this 3D layout; it does not generalise to other shapes/dims.

    Args:
        x (Tensor): Input of shape (batch, time, channels).
        pad (int): Number of zero timesteps to add per padded side.
        side (str): ``"both"``, ``"before"`` or ``"after"``. Any other value yields an all-zero tensor of
            length ``time + pad``.

    Returns:
        Tensor: Zero-initialised tensor on ``x``'s device with ``x`` copied into place.
    """
    batch, steps, channels = x.size()
    padded_len = steps + (2 * pad if side == "both" else pad)
    padded = torch.zeros(batch, padded_len, channels, device=x.device)

    # Position of the original data inside the padded buffer.
    if side == "after":
        offset = 0
    elif side in ("before", "both"):
        offset = pad
    else:
        return padded
    padded[:, offset : offset + steps, :] = x
    return padded
@staticmethod
def xfade_and_unfold(y, target, overlap):
    """Applies a crossfade and unfolds into a 1d array.

    The input array is not modified; the gain envelope is applied to a float64 copy.

    Args:
        y (ndarry)    : Batched sequences of audio samples
                        shape=(num_folds, target + 2 * overlap)
                        dtype=np.float64
        target (int)  : Kept for interface compatibility; the effective value is derived from
                        ``y.shape[1] - 2 * overlap``.
        overlap (int) : Timesteps for both xfade and rnn warmup. ``0`` concatenates the
                        sequences without any fade.

    Return:
        (ndarry) : audio samples in a 1d array
                   shape=(total_len)
                   dtype=np.float64

    Details:
        y = [[seq1],
             [seq2],
             [seq3]]
        Apply a gain envelope at both ends of the sequences
        y = [[seq1_in, seq1_target, seq1_out],
             [seq2_in, seq2_target, seq2_out],
             [seq3_in, seq3_target, seq3_out]]
        Stagger and add up the groups of samples:
        [seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...]
    """
    # Copy so the in-place gain below never leaks into the caller's array.
    y = np.array(y, dtype=np.float64)
    num_folds, length = y.shape
    target = length - 2 * overlap
    hop = target + overlap
    total_len = num_folds * hop + overlap

    # With overlap == 0 there is nothing to fade, and ``y[:, -0:]`` would select
    # the whole row and fail to broadcast against an empty envelope.
    if overlap > 0:
        # Need some silence for the rnn warmup
        silence_len = overlap // 2
        fade_len = overlap - silence_len
        silence = np.zeros((silence_len), dtype=np.float64)

        # Equal power crossfade
        t = np.linspace(-1, 1, fade_len, dtype=np.float64)
        fade_in = np.concatenate([silence, np.sqrt(0.5 * (1 + t))])
        fade_out = np.concatenate([np.sqrt(0.5 * (1 - t)), silence])

        # Apply the gain to the overlap samples
        y[:, :overlap] *= fade_in
        y[:, -overlap:] *= fade_out

    unfolded = np.zeros((total_len), dtype=np.float64)

    # Stagger the sequences by one hop each and sum the overlapping regions.
    for i in range(num_folds):
        start = i * hop
        unfolded[start : start + length] += y[i]
    return unfolded
def load_checkpoint(
    self, config, checkpoint_path, eval=False
):  # pylint: disable=unused-argument, redefined-builtin
    """Load model weights from a checkpoint file.

    Args:
        config: Unused; accepted for interface compatibility.
        checkpoint_path (str): Location handed to ``load_fsspec``.
        eval (bool): If True, switch the model to eval mode after loading.
    """
    cpu = torch.device("cpu")
    checkpoint = load_fsspec(checkpoint_path, map_location=cpu)
    self.load_state_dict(checkpoint["model"])
    if not eval:
        return
    self.eval()
    assert not self.training
def train_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]:
    """Run the model on one formatted batch and compute the loss.

    Args:
        batch (Dict): Must provide the keys ``"input"``, ``"waveform"`` and ``"waveform_coarse"``.
        criterion: Callable invoked as ``criterion(prediction, target)``.

    Returns:
        Tuple[Dict, Dict]: ``{"model_output": prediction}`` and whatever ``criterion`` returned.
    """
    mels, waveform, target = batch["input"], batch["waveform"], batch["waveform_coarse"]
    bits_mode = isinstance(self.args.mode, int)

    prediction = self.forward(waveform, mels)
    if bits_mode:
        # Move the class axis to position 1 and add a trailing singleton axis.
        prediction = prediction.transpose(1, 2).unsqueeze(-1)
    else:
        # The continuous output modes are trained against float targets.
        target = target.float()
    target = target.unsqueeze(-1)

    # compute losses
    losses = criterion(prediction, target)
    return {"model_output": prediction}, losses
def eval_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]:
    """Evaluate one batch; this simply delegates to ``train_step``."""
    step_result = self.train_step(batch, criterion)
    return step_result
@torch.no_grad()
def test(
    self, assets: Dict, test_loader: "DataLoader", output: Dict  # pylint: disable=unused-argument
) -> Tuple[Dict, Dict]:
    """Synthesize the dataset's test samples and collect spectrogram figures and audio.

    Args:
        assets (Dict): Must contain an ``"audio_processor"`` entry.
        test_loader (DataLoader): Loader whose dataset provides ``load_test_samples``.
        output (Dict): Unused.

    Returns:
        Tuple[Dict, Dict]: Figures and audios keyed by ``test_{idx}/...``.
    """
    ap = assets["audio_processor"]
    figures, audios = {}, {}
    for idx, sample in enumerate(test_loader.dataset.load_test_samples(1)):
        # First element of each sample is used as the conditioning input.
        x = torch.FloatTensor(sample[0]).to(next(self.parameters()).device)
        y_hat = self.inference(x, self.config.batched, self.config.target_samples, self.config.overlap_samples)
        x_hat = ap.melspectrogram(y_hat)
        figures[f"test_{idx}/ground_truth"] = plot_spectrogram(x.T)
        figures[f"test_{idx}/prediction"] = plot_spectrogram(x_hat.T)
        audios[f"test_{idx}/audio"] = y_hat
    return figures, audios
@staticmethod
def format_batch(batch: Dict) -> Dict:
    """Turn the positional batch into the keyword form used by the train/eval steps.

    Positions 0, 1 and 2 of ``batch`` are the waveform, the input features and the coarse waveform.
    """
    waveform, mels, waveform_coarse = (batch[position] for position in range(3))
    return {"input": mels, "waveform": waveform, "waveform_coarse": waveform_coarse}
def get_data_loader(  # pylint: disable=no-self-use
    self,
    config: Coqpit,
    assets: Dict,
    is_eval: bool,
    samples: List,
    verbose: bool,
    num_gpus: int,
) -> DataLoader:
    """Build the ``DataLoader`` for training or evaluation.

    Args:
        config (Coqpit): Provides ``seq_len``, ``batch_size``, the loader worker counts and ``model_args``.
        assets (Dict): Must contain an ``"audio_processor"`` entry.
        is_eval (bool): Build the evaluation loader (batch size 1, eval worker count) instead of the
            training loader.
        samples (List): Items handed to ``WaveRNNDataset``.
        verbose (bool): Forwarded to the dataset.
        num_gpus (int): A ``DistributedSampler`` is used when this is greater than 1.

    Returns:
        DataLoader: Loader over a ``WaveRNNDataset`` using the dataset's own ``collate``.
    """
    ap = assets["audio_processor"]
    dataset = WaveRNNDataset(
        ap=ap,
        items=samples,
        seq_len=config.seq_len,
        hop_len=ap.hop_length,
        pad=config.model_args.pad,
        mode=config.model_args.mode,
        mulaw=config.model_args.mulaw,
        is_training=not is_eval,
        verbose=verbose,
    )
    sampler = DistributedSampler(dataset, shuffle=True) if num_gpus > 1 else None
    # NOTE(review): shuffling is only requested when num_gpus == 0, so a single-GPU run gets neither
    # a sampler nor shuffle - confirm this is intended.
    loader = DataLoader(
        dataset,
        batch_size=1 if is_eval else config.batch_size,
        shuffle=num_gpus == 0,
        collate_fn=dataset.collate,
        sampler=sampler,
        num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers,
        pin_memory=True,
    )
    return loader
def get_criterion(self):
    """Return the ``WaveRNNLoss`` instance built for this model's output mode."""
    output_mode = self.args.mode
    return WaveRNNLoss(output_mode)
@staticmethod
def init_from_config(config: "WavernnConfig"):
    """Construct a ``Wavernn`` model from ``config``."""
    model = Wavernn(config)
    return model