hexgrad
/

Kokoro-82M

Text-to-Speech

ONNX

English

Model card Files Files and versions Community

hexgrad committed 3 days ago

Commit

96b9a7b

•

1 Parent(s): 3095858

Upload 3 files

Browse files

Files changed (2) hide show

README.md +1 -0
models.py +2 -219

README.md CHANGED Viewed

@@ -31,6 +31,7 @@ You can find a hosted demo at [hf.co/spaces/hexgrad/Kokoro-TTS](https://huggingf
 The following can be run in a single cell on [Google Colab](https://colab.research.google.com/).
 ```py
 # 1️⃣ Install dependencies silently
 !git clone https://huggingface.co/hexgrad/Kokoro-82M
 %cd Kokoro-82M
 !apt-get -qq -y install espeak-ng > /dev/null 2>&1

 The following can be run in a single cell on [Google Colab](https://colab.research.google.com/).
 ```py
 # 1️⃣ Install dependencies silently
+!git lfs install
 !git clone https://huggingface.co/hexgrad/Kokoro-82M
 %cd Kokoro-82M
 !apt-get -qq -y install espeak-ng > /dev/null 2>&1

models.py CHANGED Viewed

@@ -1,5 +1,5 @@
 # https://github.com/yl4579/StyleTTS2/blob/main/models.py
-from istftnet import Decoder
 from munch import Munch
 from pathlib import Path
 from plbert import load_plbert
@@ -12,118 +12,6 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
-class LearnedDownSample(nn.Module):
-    def __init__(self, layer_type, dim_in):
-        super().__init__()
-        self.layer_type = layer_type
-        if self.layer_type == 'none':
-            self.conv = nn.Identity()
-        elif self.layer_type == 'timepreserve':
-            self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, padding=(1, 0)))
-        elif self.layer_type == 'half':
-            self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, padding=1))
-        else:
-            raise RuntimeError('Got unexpected donwsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
-    def forward(self, x):
-        return self.conv(x)
-class LearnedUpSample(nn.Module):
-    def __init__(self, layer_type, dim_in):
-        super().__init__()
-        self.layer_type = layer_type
-        if self.layer_type == 'none':
-            self.conv = nn.Identity()
-        elif self.layer_type == 'timepreserve':
-            self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, output_padding=(1, 0), padding=(1, 0))
-        elif self.layer_type == 'half':
-            self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, output_padding=1, padding=1)
-        else:
-            raise RuntimeError('Got unexpected upsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
-    def forward(self, x):
-        return self.conv(x)
-class DownSample(nn.Module):
-    def __init__(self, layer_type):
-        super().__init__()
-        self.layer_type = layer_type
-    def forward(self, x):
-        if self.layer_type == 'none':
-            return x
-        elif self.layer_type == 'timepreserve':
-            return F.avg_pool2d(x, (2, 1))
-        elif self.layer_type == 'half':
-            if x.shape[-1] % 2 != 0:
-                x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
-            return F.avg_pool2d(x, 2)
-        else:
-            raise RuntimeError('Got unexpected donwsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
-class UpSample(nn.Module):
-    def __init__(self, layer_type):
-        super().__init__()
-        self.layer_type = layer_type
-    def forward(self, x):
-        if self.layer_type == 'none':
-            return x
-        elif self.layer_type == 'timepreserve':
-            return F.interpolate(x, scale_factor=(2, 1), mode='nearest')
-        elif self.layer_type == 'half':
-            return F.interpolate(x, scale_factor=2, mode='nearest')
-        else:
-            raise RuntimeError('Got unexpected upsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
-class ResBlk(nn.Module):
-    def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
-                 normalize=False, downsample='none'):
-        super().__init__()
-        self.actv = actv
-        self.normalize = normalize
-        self.downsample = DownSample(downsample)
-        self.downsample_res = LearnedDownSample(downsample, dim_in)
-        self.learned_sc = dim_in != dim_out
-        self._build_weights(dim_in, dim_out)
-    def _build_weights(self, dim_in, dim_out):
-        self.conv1 = spectral_norm(nn.Conv2d(dim_in, dim_in, 3, 1, 1))
-        self.conv2 = spectral_norm(nn.Conv2d(dim_in, dim_out, 3, 1, 1))
-        if self.normalize:
-            self.norm1 = nn.InstanceNorm2d(dim_in, affine=True)
-            self.norm2 = nn.InstanceNorm2d(dim_in, affine=True)
-        if self.learned_sc:
-            self.conv1x1 = spectral_norm(nn.Conv2d(dim_in, dim_out, 1, 1, 0, bias=False))
-    def _shortcut(self, x):
-        if self.learned_sc:
-            x = self.conv1x1(x)
-        if self.downsample:
-            x = self.downsample(x)
-        return x
-    def _residual(self, x):
-        if self.normalize:
-            x = self.norm1(x)
-        x = self.actv(x)
-        x = self.conv1(x)
-        x = self.downsample_res(x)
-        if self.normalize:
-            x = self.norm2(x)
-        x = self.actv(x)
-        x = self.conv2(x)
-        return x
-    def forward(self, x):
-        x = self._shortcut(x) + self._residual(x)
-        return x / np.sqrt(2)  # unit variance
 class LinearNorm(torch.nn.Module):
     def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
         super(LinearNorm, self).__init__()
@@ -136,98 +24,6 @@ class LinearNorm(torch.nn.Module):
     def forward(self, x):
         return self.linear_layer(x)
-class Discriminator2d(nn.Module):
-    def __init__(self, dim_in=48, num_domains=1, max_conv_dim=384, repeat_num=4):
-        super().__init__()
-        blocks = []
-        blocks += [spectral_norm(nn.Conv2d(1, dim_in, 3, 1, 1))]
-        for lid in range(repeat_num):
-            dim_out = min(dim_in*2, max_conv_dim)
-            blocks += [ResBlk(dim_in, dim_out, downsample='half')]
-            dim_in = dim_out
-        blocks += [nn.LeakyReLU(0.2)]
-        blocks += [spectral_norm(nn.Conv2d(dim_out, dim_out, 5, 1, 0))]
-        blocks += [nn.LeakyReLU(0.2)]
-        blocks += [nn.AdaptiveAvgPool2d(1)]
-        blocks += [spectral_norm(nn.Conv2d(dim_out, num_domains, 1, 1, 0))]
-        self.main = nn.Sequential(*blocks)
-    def get_feature(self, x):
-        features = []
-        for l in self.main:
-            x = l(x)
-            features.append(x)
-        out = features[-1]
-        out = out.view(out.size(0), -1)  # (batch, num_domains)
-        return out, features
-    def forward(self, x):
-        out, features = self.get_feature(x)
-        out = out.squeeze()  # (batch)
-        return out, features
-class ResBlk1d(nn.Module):
-    def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
-                 normalize=False, downsample='none', dropout_p=0.2):
-        super().__init__()
-        self.actv = actv
-        self.normalize = normalize
-        self.downsample_type = downsample
-        self.learned_sc = dim_in != dim_out
-        self._build_weights(dim_in, dim_out)
-        self.dropout_p = dropout_p
-        if self.downsample_type == 'none':
-            self.pool = nn.Identity()
-        else:
-            self.pool = weight_norm(nn.Conv1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1))
-    def _build_weights(self, dim_in, dim_out):
-        self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_in, 3, 1, 1))
-        self.conv2 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
-        if self.normalize:
-            self.norm1 = nn.InstanceNorm1d(dim_in, affine=True)
-            self.norm2 = nn.InstanceNorm1d(dim_in, affine=True)
-        if self.learned_sc:
-            self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
-    def downsample(self, x):
-        if self.downsample_type == 'none':
-            return x
-        else:
-            if x.shape[-1] % 2 != 0:
-                x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
-            return F.avg_pool1d(x, 2)
-    def _shortcut(self, x):
-        if self.learned_sc:
-            x = self.conv1x1(x)
-        x = self.downsample(x)
-        return x
-    def _residual(self, x):
-        if self.normalize:
-            x = self.norm1(x)
-        x = self.actv(x)
-        x = F.dropout(x, p=self.dropout_p, training=self.training)
-        x = self.conv1(x)
-        x = self.pool(x)
-        if self.normalize:
-            x = self.norm2(x)
-        x = self.actv(x)
-        x = F.dropout(x, p=self.dropout_p, training=self.training)
-        x = self.conv2(x)
-        return x
-    def forward(self, x):
-        x = self._shortcut(x) + self._residual(x)
-        return x / np.sqrt(2)  # unit variance
 class LayerNorm(nn.Module):
     def __init__(self, channels, eps=1e-5):
         super().__init__()
@@ -306,19 +102,6 @@ class TextEncoder(nn.Module):
         return mask
-class AdaIN1d(nn.Module):
-    def __init__(self, style_dim, num_features):
-        super().__init__()
-        self.norm = nn.InstanceNorm1d(num_features, affine=False)
-        self.fc = nn.Linear(style_dim, num_features*2)
-    def forward(self, x, s):
-        h = self.fc(s)
-        h = h.view(h.size(0), h.size(1), 1)
-        gamma, beta = torch.chunk(h, chunks=2, dim=1)
-        return (1 + gamma) * self.norm(x) + beta
 class UpSample1d(nn.Module):
     def __init__(self, layer_type):
         super().__init__()
@@ -474,7 +257,7 @@ class ProsodyPredictor(nn.Module):
         mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
         mask = torch.gt(mask+1, lengths.unsqueeze(1))
         return mask
 class DurationEncoder(nn.Module):
     def __init__(self, sty_dim, d_model, nlayers, dropout=0.1):

 # https://github.com/yl4579/StyleTTS2/blob/main/models.py
+from istftnet import AdaIN1d, Decoder
 from munch import Munch
 from pathlib import Path
 from plbert import load_plbert
 import torch.nn as nn
 import torch.nn.functional as F
 class LinearNorm(torch.nn.Module):
     def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
         super(LinearNorm, self).__init__()
     def forward(self, x):
         return self.linear_layer(x)
 class LayerNorm(nn.Module):
     def __init__(self, channels, eps=1e-5):
         super().__init__()
         return mask
 class UpSample1d(nn.Module):
     def __init__(self, layer_type):
         super().__init__()
         mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
         mask = torch.gt(mask+1, lengths.unsqueeze(1))
         return mask
 class DurationEncoder(nn.Module):
     def __init__(self, sty_dim, d_model, nlayers, dropout=0.1):