import torchaudio

class TextTransform:
    """Maps characters to integer labels and back for CTC-style training."""
    def __init__(self):
        char_map_str = """
        ' 0
        <SPACE> 1
        a 2
        b 3
        c 4
        d 5
        e 6
        f 7
        g 8
        h 9
        i 10
        j 11
        k 12
        l 13
        m 14
        n 15
        o 16
        p 17
        q 18
        r 19
        s 20
        t 21
        u 22
        v 23
        w 24
        x 25
        y 26
        z 27
        """
        self.char_map = {}
        self.index_map = {}
        for line in char_map_str.strip().split('\n'):
            ch, index = line.split()
            self.char_map[ch] = int(index)
            self.index_map[int(index)] = ch
        self.index_map[1] = ' '

    def text_to_int(self, text):
        """Convert a lowercase transcript into a sequence of integer labels."""
        int_sequence = []
        for c in text:
            if c == ' ':
                ch = self.char_map['<SPACE>']
            else:
                ch = self.char_map[c]
            int_sequence.append(ch)
        return int_sequence

    def int_to_text(self, labels):
        """Convert a sequence of integer labels back into a transcript."""
        string = []
        for i in labels:
            string.append(self.index_map[i])
        return ''.join(string).replace('<SPACE>', ' ')
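# Quick round-trip check for TextTransform (illustrative only, not part of the
# original listing): encoding a lowercase transcript and decoding it again should
# reproduce the input exactly.
_tt = TextTransform()
assert _tt.int_to_text(_tt.text_to_int("hello world")) == "hello world"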
from torch import nn

# Training-time feature pipeline: mel spectrogram followed by SpecAugment-style
# frequency and time masking.
trainaudio_transforms = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=15),
    torchaudio.transforms.TimeMasking(time_mask_param=35))

text_transform = TextTransform()
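# Illustrative shape check (not part of the original listing), assuming 16 kHz mono
# audio: MelSpectrogram maps a (channel, samples) waveform to (channel, n_mels, time)
# and the masking transforms keep that shape.
import torch
_spec = trainaudio_transforms(torch.randn(1, 16000))   # one second of dummy audio
assert _spec.shape[:2] == (1, 128)                      # (channel, n_mels, time_frames)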
import torch.nn.functional as F

class CNNLayerNorm(nn.Module):
    """Layer normalization over the feature axis of a (batch, channel, feature, time) tensor."""
    def __init__(self, n_feats):
        super(CNNLayerNorm, self).__init__()
        self.layer_norm = nn.LayerNorm(n_feats)

    def forward(self, x):
        # x: (batch, channel, feature, time)
        x = x.transpose(2, 3).contiguous()      # (batch, channel, time, feature)
        x = self.layer_norm(x)
        return x.transpose(2, 3).contiguous()   # back to (batch, channel, feature, time)
class ResidualCNN(nn.Module):
    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
        super(ResidualCNN, self).__init__()
        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride, padding=kernel // 2)
        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding=kernel // 2)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.layernorm1 = CNNLayerNorm(n_feats)
        self.layernorm2 = CNNLayerNorm(n_feats)

    def forward(self, x):
        residual = x
        x = self.layernorm1(x)
        x = self.dropout1(x)
        x = F.gelu(x)
        x = self.cnn1(x)
        x = self.layernorm2(x)
        x = self.dropout2(x)
        x = F.gelu(x)
        x = self.cnn2(x)
        x += residual
        return x
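# Illustrative check (not part of the original listing): with stride 1 and
# padding=kernel//2 the block preserves the (batch, channel, feature, time) shape,
# which is what makes the skip connection x += residual valid.
_block = ResidualCNN(32, 32, kernel=3, stride=1, dropout=0.1, n_feats=64)
assert _block(torch.randn(2, 32, 64, 50)).shape == (2, 32, 64, 50)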
class BiDirectionalGRU(nn.Module):
    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
        super(BiDirectionalGRU, self).__init__()
        self.BiGRU = nn.GRU(
            input_size=rnn_dim, hidden_size=hidden_size,
            num_layers=1, batch_first=batch_first, bidirectional=True)
        self.layernorm = nn.LayerNorm(rnn_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.layernorm(x)
        x = F.gelu(x)
        x, _ = self.BiGRU(x)
        x = self.dropout(x)
        return x
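# Illustrative check (not part of the original listing): the bidirectional GRU
# concatenates both directions, so the feature dimension doubles from rnn_dim to
# 2 * hidden_size.
_gru = BiDirectionalGRU(rnn_dim=64, hidden_size=64, dropout=0.1, batch_first=True)
assert _gru(torch.randn(2, 50, 64)).shape == (2, 50, 128)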
class SpeechRecognitionModel(nn.Module):
    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1):
        super(SpeechRecognitionModel, self).__init__()
        n_feats = n_feats // 2  # the stride-2 entry convolution halves the feature axis
        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3 // 2)
        self.rescnn_layers = nn.Sequential(*[
            ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=n_feats)
            for _ in range(n_cnn_layers)
        ])
        self.fully_connected = nn.Linear(n_feats * 32, rnn_dim)
        self.birnn_layers = nn.Sequential(*[
            BiDirectionalGRU(rnn_dim=rnn_dim if i == 0 else rnn_dim * 2,
                             hidden_size=rnn_dim, dropout=dropout, batch_first=i == 0)
            for i in range(n_rnn_layers)
        ])
        self.classifier = nn.Sequential(
            nn.Linear(rnn_dim * 2, rnn_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(rnn_dim, n_class))

    def forward(self, x):
        # x: (batch, 1, n_feats, time) spectrogram
        x = self.cnn(x)
        x = self.rescnn_layers(x)
        sizes = x.size()
        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])  # (batch, channel * feature, time)
        x = x.transpose(1, 2)                                # (batch, time, channel * feature)
        x = self.fully_connected(x)
        x = self.birnn_layers(x)
        x = self.classifier(x)                               # (batch, time, n_class)
        return x
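# Illustrative shape check (not part of the original listing; tiny dimensions chosen
# only to keep it cheap): a (batch, 1, n_feats, time) spectrogram comes out as
# per-frame class scores over a time axis downsampled by the stride-2 convolution.
_m = SpeechRecognitionModel(n_cnn_layers=1, n_rnn_layers=1, rnn_dim=64, n_class=29, n_feats=128)
assert _m(torch.randn(2, 1, 128, 100)).shape == (2, 50, 29)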
import torch
import os
from pathlib import Path

learning_rate = 5e-4
batch_size = 16
epochs = 5
libri_train_set = "train-clean-100"
libri_test_set = "test-clean"

hparams = {
    "n_cnn_layers": 3,
    "n_rnn_layers": 5,
    "rnn_dim": 512,
    "n_class": 29,       # 28 character labels plus one class reserved for the CTC blank
    "n_feats": 128,
    "stride": 2,
    "dropout": 0.1,
    "learning_rate": learning_rate,
    "batch_size": batch_size,
    "epochs": epochs,
}
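# Sketch of how these hyperparameters are typically wired together for CTC training
# (the AdamW optimizer and CTCLoss(blank=28) below are assumptions in the spirit of
# the usual LibriSpeech recipe, not something stated in this listing).
def build_training_objects(hparams, device):
    import torch.optim as optim
    model = SpeechRecognitionModel(
        hparams['n_cnn_layers'], hparams['n_rnn_layers'], hparams['rnn_dim'],
        hparams['n_class'], hparams['n_feats'], hparams['stride'], hparams['dropout']
    ).to(device)
    optimizer = optim.AdamW(model.parameters(), hparams['learning_rate'])
    criterion = nn.CTCLoss(blank=28).to(device)  # label index 28 acts as the CTC blank
    return model, optimizer, criterion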