# Speech / Notebook.py — "Create Notebook.py" by NeerAbhy (Hugging Face export, 8bf30cb).
# NOTE(review): the original export carried file-viewer chrome ("raw / history /
# blame / 4.78 kB") as bare text, which made the file unparseable as Python;
# it is preserved here as comments instead.
import torchaudio
class TextTransform:
    """Bidirectional mapping between transcript characters and integer labels.

    Labels 0-27 cover apostrophe, space, and a-z.  Label 28 is not listed and
    is presumably the CTC blank (the model below uses n_class = 29) — confirm
    against the loss setup.
    """

    def __init__(self):
        char_map_str = """
' 0
<SPACE> 1
a 2
b 3
c 4
d 5
e 6
f 7
g 8
h 9
i 10
j 11
k 12
l 13
m 14
n 15
o 16
p 17
q 18
r 19
s 20
t 21
u 22
v 23
w 24
x 25
y 26
z 27
"""
        self.char_map = {}   # char -> int label
        self.index_map = {}  # int label -> char
        for line in char_map_str.strip().split('\n'):
            ch, index = line.split()
            self.char_map[ch] = int(index)
            self.index_map[int(index)] = ch
        # Decode label 1 straight to a real space (overrides the '<SPACE>'
        # placeholder), so int_to_text never has to post-process.
        self.index_map[1] = ' '

    def text_to_int(self, text):
        """Convert a transcript string to a list of integer labels.

        The text is lower-cased first, so upper-case transcripts (e.g. raw
        LibriSpeech) no longer raise KeyError.  Characters outside the map
        (digits, punctuation) still raise KeyError deliberately.
        """
        int_sequence = []
        for c in text.lower():
            if c == ' ':
                ch = self.char_map['<SPACE>']
            else:
                ch = self.char_map[c]
            int_sequence.append(ch)
        return int_sequence

    def int_to_text(self, labels):
        """Convert a sequence of integer labels back to a string.

        (The original also ran .replace('<SPACE>', ' '), but that was dead
        code: index_map[1] is already ' ' after __init__.)
        """
        return ''.join(self.index_map[i] for i in labels)
from torch import nn
# Training-time feature pipeline: 128-bin mel spectrogram (16 kHz input)
# followed by SpecAugment-style frequency and time masking for augmentation.
trainaudio_transforms = nn.Sequential(
torchaudio.transforms.MelSpectrogram(sample_rate = 16000, n_mels = 128),
torchaudio.transforms.FrequencyMasking(freq_mask_param = 15),
torchaudio.transforms.TimeMasking(time_mask_param = 35))
# Shared text <-> label-index codec, presumably used by the dataset collate
# step elsewhere in the notebook.
text_transform = TextTransform()
import torch.nn.functional as F
class CNNLayerNorm(nn.Module):
    """LayerNorm over the feature axis of a (batch, channel, feature, time) tensor.

    nn.LayerNorm normalizes the last dimension, so the feature and time axes
    are swapped before and after the normalization.
    """

    def __init__(self, n_feats):
        super(CNNLayerNorm, self).__init__()
        self.layer_norm = nn.LayerNorm(n_feats)

    def forward(self, x):
        # (batch, channel, feature, time) -> (batch, channel, time, feature)
        swapped = x.transpose(2, 3).contiguous()
        normed = self.layer_norm(swapped)
        # ... and back to the original layout.
        return normed.transpose(2, 3).contiguous()
class ResidualCNN(nn.Module):
    """Residual block: two (LayerNorm -> Dropout -> GELU -> Conv2d) stages plus a skip.

    With stride 1 and padding kernel//2 the spatial shape is preserved, so the
    skip connection adds tensors of identical shape.
    """

    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
        super(ResidualCNN, self).__init__()
        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride, padding=kernel // 2)
        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding=kernel // 2)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.layernorm1 = CNNLayerNorm(n_feats)
        self.layernorm2 = CNNLayerNorm(n_feats)

    def forward(self, x):
        skip = x
        # First pre-activation stage.
        out = self.cnn1(F.gelu(self.dropout1(self.layernorm1(x))))
        # Second pre-activation stage.
        out = self.cnn2(F.gelu(self.dropout2(self.layernorm2(out))))
        return out + skip
class BiDirectionalGRU(nn.Module):
    """LayerNorm + GELU, then a single bidirectional GRU layer, then dropout.

    The bidirectional GRU doubles the feature dimension: input rnn_dim ->
    output 2 * hidden_size.
    """

    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
        super(BiDirectionalGRU, self).__init__()
        self.BiGRU = nn.GRU(
            input_size=rnn_dim, hidden_size=hidden_size,
            num_layers=1, batch_first=batch_first, bidirectional=True)
        self.layernorm = nn.LayerNorm(rnn_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        activated = F.gelu(self.layernorm(x))
        rnn_out, _ = self.BiGRU(activated)  # hidden state is discarded
        return self.dropout(rnn_out)
class SpeechRecognitionModel(nn.Module):
    """CNN front-end + residual CNN stack + bidirectional GRU stack + classifier.

    Input:  (batch, 1, n_feats, time) spectrograms.
    Output: (batch, time', n_class) per-frame logits (time' is halved by the
            stride-2 entry convolution), suitable for a CTC loss.
    """

    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1):
        super(SpeechRecognitionModel, self).__init__()
        # The stride-2 entry conv halves the frequency axis.
        n_feats = n_feats // 2
        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3 // 2)
        self.rescnn_layers = nn.Sequential(*[
            ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=n_feats)
            for _ in range(n_cnn_layers)
        ])
        self.fully_connected = nn.Linear(n_feats * 32, rnn_dim)
        # BUG FIX: the original passed batch_first=(i == 0), so every GRU layer
        # after the first interpreted the batch axis as time — the tensor stays
        # (batch, time, feature) throughout forward().  All layers must be
        # batch-first.  The first layer sees rnn_dim features; later layers see
        # the 2*rnn_dim output of the preceding bidirectional layer.
        self.birnn_layers = nn.Sequential(*[
            BiDirectionalGRU(rnn_dim=rnn_dim if i == 0 else rnn_dim * 2,
                             hidden_size=rnn_dim, dropout=dropout, batch_first=True)
            for i in range(n_rnn_layers)
        ])
        self.classifier = nn.Sequential(
            nn.Linear(rnn_dim * 2, rnn_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(rnn_dim, n_class))

    def forward(self, x):
        """Map (batch, 1, n_feats, time) spectrograms to per-frame logits."""
        x = self.cnn(x)
        x = self.rescnn_layers(x)
        batch, channels, feats, time = x.size()
        # Flatten channel*frequency into one feature axis, time-major last.
        x = x.view(batch, channels * feats, time)
        x = x.transpose(1, 2)  # -> (batch, time, channels*feats)
        x = self.fully_connected(x)
        x = self.birnn_layers(x)
        return self.classifier(x)
import torch
import os
from pathlib import Path
# Training hyperparameters.
learning_rate=5e-4
batch_size=16
epochs=5
libri_train_set = "train-clean-100"  # LibriSpeech training split identifier
libri_test_set = "test-clean"        # LibriSpeech evaluation split identifier
hparams = {
"n_cnn_layers": 3,    # number of ResidualCNN blocks
"n_rnn_layers": 5,    # number of stacked BiDirectionalGRU layers
"rnn_dim": 512,       # GRU hidden size
"n_class": 29,        # presumably 28 characters (TextTransform) + CTC blank — confirm against loss
"n_feats": 128,       # mel bins; matches MelSpectrogram(n_mels=128) above
"stride":2,           # entry-conv stride (halves frequency and time)
"dropout": 0.1,
"learning_rate": learning_rate,
"batch_size": batch_size,
"epochs": epochs
}