|
import torch |
|
import torch.nn as nn |
|
import torch.nn.functional as F |
|
from torch.utils.checkpoint import checkpoint |
|
|
|
from tortoise_tts.models.arch_util import Upsample, Downsample, normalization, zero_module, AttentionBlock |
|
|
|
|
|
class ResBlock(nn.Module):
    """Residual block built from 1-D convolutions, with optional resampling.

    The input runs through ``in_layers`` (normalization -> SiLU -> conv) and
    ``out_layers`` (normalization -> SiLU -> dropout -> conv); the result is
    added to ``skip_connection(x)``.

    Args:
        channels: Number of input channels.
        dropout: Dropout probability used inside ``out_layers``.
        out_channels: Number of output channels; falls back to ``channels``
            when falsy.
        use_conv: When the channel count changes, project the skip path with
            a ``kernel_size`` convolution instead of a 1x1 convolution.
        use_scale_shift_norm: Stored on the instance; this block does not
            otherwise read it.
        dims: Forwarded to ``Upsample`` / ``Downsample``. The convolutions in
            this block are always ``nn.Conv1d``.
        up: Resample both branches with ``Upsample``.
        down: Resample both branches with ``Downsample`` (only consulted when
            ``up`` is false).
        kernel_size: Kernel size of the convolutions.
        do_checkpoint: Route ``forward`` through ``torch.utils.checkpoint``.
    """

    def __init__(
        self,
        channels,
        dropout,
        out_channels=None,
        use_conv=False,
        use_scale_shift_norm=False,
        dims=2,
        up=False,
        down=False,
        kernel_size=3,
        do_checkpoint=True,
    ):
        super().__init__()
        self.channels = channels
        self.dropout = dropout
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.use_scale_shift_norm = use_scale_shift_norm
        self.do_checkpoint = do_checkpoint
        # Length-preserving padding for odd kernels. Yields 1 for k=3 and 2
        # for k=5 (the two values the block previously hard-coded) and stays
        # correct for other odd sizes, so the residual add keeps matching.
        padding = kernel_size // 2

        self.in_layers = nn.Sequential(
            normalization(channels),
            nn.SiLU(),
            nn.Conv1d(channels, self.out_channels, kernel_size, padding=padding),
        )

        self.updown = up or down

        if up:
            self.h_upd = Upsample(channels, False, dims)
            self.x_upd = Upsample(channels, False, dims)
        elif down:
            self.h_upd = Downsample(channels, False, dims)
            self.x_upd = Downsample(channels, False, dims)
        else:
            self.h_upd = self.x_upd = nn.Identity()

        self.out_layers = nn.Sequential(
            normalization(self.out_channels),
            nn.SiLU(),
            nn.Dropout(p=dropout),
            # NOTE(review): zero_module presumably zero-initialises the conv so
            # the block starts out as its skip path -- confirm in arch_util.
            zero_module(
                nn.Conv1d(self.out_channels, self.out_channels, kernel_size, padding=padding)
            ),
        )

        self.skip_connection = self._build_skip_connection(
            channels, self.out_channels, use_conv, kernel_size, padding
        )

    @staticmethod
    def _build_skip_connection(channels, out_channels, use_conv, kernel_size, padding):
        """Return the module that maps ``channels`` to ``out_channels`` on the skip path."""
        if out_channels == channels:
            return nn.Identity()
        # nn.Conv1d's signature is (in_channels, out_channels, kernel_size, ...).
        # It has no leading ``dims`` argument; passing one shifted every
        # argument by one and produced a layer with the wrong shape.
        if use_conv:
            return nn.Conv1d(channels, out_channels, kernel_size, padding=padding)
        return nn.Conv1d(channels, out_channels, 1)

    def forward(self, x):
        """Apply the block to ``x``, via activation checkpointing if enabled."""
        if self.do_checkpoint:
            return checkpoint(self._forward, x)
        return self._forward(x)

    def _forward(self, x):
        """Compute ``skip_connection(x) + out_layers(in_layers(x))``."""
        if self.updown:
            # Resample after normalization/activation but before the first
            # conv, and resample the skip input the same way.
            in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
            h = in_rest(x)
            h = self.h_upd(h)
            x = self.x_upd(x)
            h = in_conv(h)
        else:
            h = self.in_layers(x)
        h = self.out_layers(h)
        return self.skip_connection(x) + h
|
|
|
|
|
class AudioMiniEncoder(nn.Module):
    """Convolutional encoder followed by attention blocks, producing one vector per input.

    Pipeline: a 3-wide input conv, then ``depth`` stages of ``resnet_blocks``
    ``ResBlock`` modules each followed by a channel-doubling ``Downsample``,
    then normalization -> SiLU -> 1x1 conv to ``embedding_dim``, then
    ``attn_blocks`` ``AttentionBlock`` modules.

    Args:
        spec_dim: Channel count of the input fed to the first ``nn.Conv1d``.
        embedding_dim: Channel count of the output (also stored as ``dim``).
        base_channels: Channel count after the input conv; doubled per stage.
        depth: Number of stages (also stored as ``layers``).
        resnet_blocks: ``ResBlock`` modules per stage.
        attn_blocks: Number of ``AttentionBlock`` modules.
        num_attn_heads: Passed to each ``AttentionBlock``.
        dropout: Passed to each ``ResBlock``.
        downsample_factor: Passed as ``factor`` to each ``Downsample``.
        kernel_size: Passed to each ``ResBlock``.
    """

    def __init__(self,
                 spec_dim,
                 embedding_dim,
                 base_channels=128,
                 depth=2,
                 resnet_blocks=2,
                 attn_blocks=4,
                 num_attn_heads=4,
                 dropout=0,
                 downsample_factor=2,
                 kernel_size=3):
        super().__init__()
        self.layers = depth
        self.dim = embedding_dim

        # Kept as a one-element Sequential so parameter names stay "init.0.*".
        self.init = nn.Sequential(nn.Conv1d(spec_dim, base_channels, 3, padding=1))

        width = base_channels
        stages = []
        for _ in range(depth):
            stages.extend(
                ResBlock(width, dropout, do_checkpoint=False, kernel_size=kernel_size)
                for _ in range(resnet_blocks)
            )
            stages.append(
                Downsample(width, use_conv=True, out_channels=width * 2, factor=downsample_factor)
            )
            width *= 2
        self.res = nn.Sequential(*stages)

        self.final = nn.Sequential(
            normalization(width),
            nn.SiLU(),
            nn.Conv1d(width, embedding_dim, 1),
        )

        self.attn = nn.Sequential(*[
            AttentionBlock(embedding_dim, num_attn_heads, do_checkpoint=False)
            for _ in range(attn_blocks)
        ])

    def forward(self, x):
        """Encode ``x`` and return index 0 of the last dimension of the result."""
        hidden = self.final(self.res(self.init(x)))
        # The attention blocks are built with do_checkpoint=False and are
        # checkpointed here, one block at a time.
        for block in self.attn:
            hidden = checkpoint(block, hidden)
        return hidden[:, :, 0]
|
|
|
|
|
class AudioMiniEncoderWithClassifierHead(nn.Module):
    """``AudioMiniEncoder`` with a linear classification head on top.

    Args:
        classes: Number of output classes of the linear head.
        distribute_zero_label: When computing the loss, soften the targets of
            samples labelled 0 (see ``forward``).
        **kwargs: Forwarded unchanged to ``AudioMiniEncoder``.
    """

    def __init__(self, classes, distribute_zero_label=True, **kwargs):
        super().__init__()
        self.enc = AudioMiniEncoder(**kwargs)
        self.head = nn.Linear(self.enc.dim, classes)
        self.num_classes = classes
        self.distribute_zero_label = distribute_zero_label

    def _soften_zero_labels(self, labels):
        """Return one-hot targets where label-0 rows give 0.2 of their mass to the other classes."""
        one_hot = nn.functional.one_hot(labels, num_classes=self.num_classes)
        per_class_share = .2 / (self.num_classes - 1)
        # Per-row correction: -0.2 on class 0, an equal share on every other class.
        correction = torch.full_like(one_hot, fill_value=per_class_share, dtype=torch.float)
        correction[:, 0] = -.2
        # Only rows whose label is 0 receive the correction.
        is_zero_label = (labels == 0).unsqueeze(-1)
        return one_hot + correction * is_zero_label

    def forward(self, x, labels=None):
        """Return logits when ``labels`` is None, otherwise the cross-entropy loss.

        With ``distribute_zero_label`` set, the loss uses the softened
        targets from ``_soften_zero_labels``; otherwise ``labels`` is passed
        to ``cross_entropy`` as given.
        """
        logits = self.head(self.enc(x))
        if labels is None:
            return logits

        if self.distribute_zero_label:
            targets = self._soften_zero_labels(labels)
        else:
            targets = labels
        return nn.functional.cross_entropy(logits, targets)
|
|