import torch.nn as nn
from torchtext import data
import copy
import layers
class Embedder(nn.Module):
def __init__(self, vocab_size, d_model):
super().__init__()
self.vocab_size = vocab_size
self.d_model = d_model
self.embed = nn.Embedding(vocab_size, d_model)
def forward(self, x):
return self.embed(x)
class EncoderLayer(nn.Module):
def __init__(self, d_model, heads, dropout=0.1):
"""An layer of the encoder. Contain a self-attention accepting padding mask
Args:
d_model: the inner dimension size of the layer
heads: number of heads used in the attention
dropout: applied dropout value during training
"""
super().__init__()
self.norm_1 = layers.Norm(d_model)
self.norm_2 = layers.Norm(d_model)
self.attn = layers.MultiHeadAttention(heads, d_model, dropout=dropout)
self.ff = layers.FeedForward(d_model, dropout=dropout)
self.dropout_1 = nn.Dropout(dropout)
self.dropout_2 = nn.Dropout(dropout)
def forward(self, x, src_mask):
"""Run the encoding layer
Args:
x: the input (either embedding values or previous layer output), should be in shape [batch_size, src_len, d_model]
src_mask: the padding mask, should be [batch_size, 1, src_len]
Return:
an output that has the same shape as the input, [batch_size, src_len, d_model]
the self-attention weights used, [batch_size, heads, src_len, src_len]
"""
x2 = self.norm_1(x)
# Self attention only
x_sa, sa = self.attn(x2, x2, x2, src_mask)
x = x + self.dropout_1(x_sa)
x2 = self.norm_2(x)
x = x + self.dropout_2(self.ff(x2))
return x, sa
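
# A minimal usage sketch for a single EncoderLayer (illustration only, not used by the model code).
# Assumptions: the toy sizes below are arbitrary, and the MultiHeadAttention in `layers`
# follows the common convention of ignoring positions where the mask value is 0.
def _encoder_layer_example():
    import torch
    layer = EncoderLayer(d_model=512, heads=8, dropout=0.1)
    x = torch.randn(2, 7, 512)                        # [batch_size, src_len, d_model]
    src_mask = torch.ones(2, 1, 7, dtype=torch.bool)  # padding mask; all ones = nothing is padding
    out, attn = layer(x, src_mask)                    # out keeps the input shape [2, 7, 512]
    return out, attn
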
class DecoderLayer(nn.Module):
def __init__(self, d_model, heads, dropout=0.1):
"""An layer of the decoder. Contain a self-attention that accept no-peeking mask and a normal attention tha t accept padding mask
Args:
d_model: the inner dimension size of the layer
heads: number of heads used in the attention
dropout: applied dropout value during training
"""
super().__init__()
self.norm_1 = layers.Norm(d_model)
self.norm_2 = layers.Norm(d_model)
self.norm_3 = layers.Norm(d_model)
self.dropout_1 = nn.Dropout(dropout)
self.dropout_2 = nn.Dropout(dropout)
self.dropout_3 = nn.Dropout(dropout)
self.attn_1 = layers.MultiHeadAttention(heads, d_model, dropout=dropout)
self.attn_2 = layers.MultiHeadAttention(heads, d_model, dropout=dropout)
self.ff = layers.FeedForward(d_model, dropout=dropout)
def forward(self, x, memory, src_mask, trg_mask):
"""Run the decoding layer
Args:
x: the input (either embedding values or previous layer output), should be in shape [batch_size, trg_len, d_model]
memory: the outputs of the encoder, used by the encoder-decoder attention. [batch_size, src_len, d_model]
src_mask: the padding mask for the memory, [batch_size, 1, src_len]
trg_mask: the no-peeking mask for the decoder, [batch_size, trg_len, trg_len]
Return:
an output that has the same shape as the input, [batch_size, trg_len, d_model]
the self-attention and encoder-decoder attention weights, [batch_size, heads, trg_len, trg_len] and [batch_size, heads, trg_len, src_len]
"""
x2 = self.norm_1(x)
# Self-attention
x_sa, sa = self.attn_1(x2, x2, x2, trg_mask)
x = x + self.dropout_1(x_sa)
x2 = self.norm_2(x)
# Normal multi-head attention
x_na, na = self.attn_2(x2, memory, memory, src_mask)
x = x + self.dropout_2(x_na)
x2 = self.norm_3(x)
x = x + self.dropout_3(self.ff(x2))
return x, (sa, na)
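
# A minimal usage sketch for a single DecoderLayer (illustration only, not used by the model code).
# Assumptions: same mask convention as above (0 = do not attend), arbitrary toy sizes,
# and a lower-triangular matrix as the no-peeking mask.
def _decoder_layer_example():
    import torch
    layer = DecoderLayer(d_model=512, heads=8, dropout=0.1)
    x = torch.randn(2, 5, 512)                         # [batch_size, trg_len, d_model]
    memory = torch.randn(2, 7, 512)                    # stand-in for the encoder output
    src_mask = torch.ones(2, 1, 7, dtype=torch.bool)   # padding mask over the source
    trg_mask = torch.tril(torch.ones(2, 5, 5)).bool()  # no-peeking mask, [batch_size, trg_len, trg_len]
    out, (sa, na) = layer(x, memory, src_mask, trg_mask)
    return out, sa, na                                 # sa: self-attention, na: encoder-decoder attention
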
def get_clones(module, N, keep_module=True):
if(keep_module and N >= 1):
# create N-1 copies in addition to the original
return nn.ModuleList([module] + [copy.deepcopy(module) for i in range(N-1)])
else:
# create N independent deep copies
return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
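
# A small sanity-check sketch for get_clones (illustration only): with keep_module=True,
# index 0 is the original module object and the rest are independent deep copies.
def _get_clones_example():
    base = nn.Linear(4, 4)
    clones = get_clones(base, 3)
    assert clones[0] is base                    # the original module is kept as the first element
    assert clones[1] is not base                # the remaining N-1 entries are deep copies...
    assert clones[1].weight is not base.weight  # ...with their own, independent parameters
    return clones
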
class Encoder(nn.Module):
"""A wrapper that embed, positional encode, and self-attention encode the inputs.
Args:
vocab_size: the size of the vocab. Used for embedding
d_model: the inner dim of the module
N: number of layers used
heads: number of heads used in the attention
dropout: dropout rate applied during training
max_seq_length: the maximum sequence length supported by this encoder. Needed by PositionalEncoder, due to caching
"""
def __init__(self, vocab_size, d_model, N, heads, dropout, max_seq_length=200):
super().__init__()
self.N = N
self.embed = nn.Embedding(vocab_size, d_model)
self.pe = layers.PositionalEncoder(d_model, dropout=dropout, max_seq_length=max_seq_length)
self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
self.norm = layers.Norm(d_model)
self._max_seq_length = max_seq_length
def forward(self, src, src_mask, output_attention=False, seq_length_check=False):
"""Accepts a batch of indexed tokens, return the encoded values.
Args:
src: int Tensor of [batch_size, src_len]
src_mask: the padding mask, [batch_size, 1, src_len]
output_attention: if set, also return a list containing the attention weights used
seq_length_check: if set, automatically trim inputs that exceed the expected maximum sequence length.
Returns:
the encoded values [batch_size, src_len, d_model]
if output_attention is set, a list of the N self-attention weights calculated, each of shape [batch_size, heads, src_len, src_len]
"""
if(seq_length_check and src.shape[1] > self._max_seq_length):
src = src[:, :self._max_seq_length]
src_mask = src_mask[:, :, :self._max_seq_length]
x = self.embed(src)
x = self.pe(x)
attentions = [None] * self.N
for i in range(self.N):
x, attn = self.layers[i](x, src_mask)
attentions[i] = attn
x = self.norm(x)
return x if(not output_attention) else (x, attentions)
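
# A minimal usage sketch for the Encoder (illustration only, not used by the model code).
# Assumptions: pad index 1 for the toy batch, arbitrary hyperparameters, and the usual
# convention that mask value 0 marks padding positions to be ignored by the attention.
def _encoder_usage_example():
    import torch
    vocab_size, d_model, N, heads, dropout = 1000, 512, 2, 8, 0.1
    encoder = Encoder(vocab_size, d_model, N, heads, dropout, max_seq_length=200)
    src = torch.randint(0, vocab_size, (2, 7))  # [batch_size, src_len] token indices
    src_mask = (src != 1).unsqueeze(1)          # [batch_size, 1, src_len] padding mask (pad index 1 assumed)
    memory, attentions = encoder(src, src_mask, output_attention=True)
    return memory, attentions                   # memory: [2, 7, 512]; attentions: list of N tensors
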
class Decoder(nn.Module):
"""A wrapper that receive the encoder outputs, run through the decoder process for a determined input
Args:
vocab_size: the size of the vocab. Used for embedding
d_model: the inner dim of the module
N: number of layers used
heads: number of heads used in the attention
dropout: dropout rate applied during training
max_seq_length: the maximum sequence length supported by this decoder. Needed by PositionalEncoder, due to caching
"""
def __init__(self, vocab_size, d_model, N, heads, dropout, max_seq_length=200):
super().__init__()
self.N = N
self.embed = nn.Embedding(vocab_size, d_model)
self.pe = layers.PositionalEncoder(d_model, dropout=dropout, max_seq_length=max_seq_length)
self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
self.norm = layers.Norm(d_model)
self._max_seq_length = max_seq_length
def forward(self, trg, memory, src_mask, trg_mask, output_attention=False):
"""Accepts a batch of indexed tokens and the encoding outputs, return the decoded values.
Args:
trg: input Tensor of [batch_size, trg_len]
memory: output of Encoder [batch_size, src_len, d_model]
src_mask: the padding mask, [batch_size, 1, src_len]
trg_mask: the no-peeking mask, [batch_size, tgt_len, tgt_len]
output_attention: if set, output a list containing used attention
Returns:
the decoded values [batch_size, tgt_len, d_model]
if available, list of N (self-attention, attention) calculated. They are in form of [batch_size, heads, tgt_len, tgt/src_len]
"""
x = self.embed(trg)
x = self.pe(x)
attentions = [None] * self.N
for i in range(self.N):
x, attn = self.layers[i](x, memory, src_mask, trg_mask)
attentions[i] = attn
x = self.norm(x)
return x if(not output_attention) else (x, attentions)
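
# A minimal end-to-end sketch chaining Encoder and Decoder (illustration only).
# Assumptions: pad index 1, arbitrary toy hyperparameters, and the mask conventions
# described above (padding mask for the source, lower-triangular no-peeking mask for the target).
def _encoder_decoder_usage_example():
    import torch
    src_vocab, trg_vocab, d_model, N, heads, dropout = 1000, 1200, 512, 2, 8, 0.1
    encoder = Encoder(src_vocab, d_model, N, heads, dropout)
    decoder = Decoder(trg_vocab, d_model, N, heads, dropout)
    src = torch.randint(0, src_vocab, (2, 7))          # [batch_size, src_len]
    trg = torch.randint(0, trg_vocab, (2, 5))          # [batch_size, trg_len]
    src_mask = (src != 1).unsqueeze(1)                 # [batch_size, 1, src_len] padding mask
    trg_mask = torch.tril(torch.ones(2, 5, 5)).bool()  # [batch_size, trg_len, trg_len] no-peeking mask
    memory = encoder(src, src_mask)                    # [batch_size, src_len, d_model]
    out = decoder(trg, memory, src_mask, trg_mask)     # [batch_size, trg_len, d_model]
    return out                                         # a final Linear projection to trg_vocab would typically follow
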
class Config:
"""Deprecated"""
def __init__(self):
self.opt = {
'train_src_data':'/workspace/khoai23/opennmt/data/iwslt_en_vi/train.en',
'train_trg_data':'/workspace/khoai23/opennmt/data/iwslt_en_vi/train.vi',
'valid_src_data':'/workspace/khoai23/opennmt/data/iwslt_en_vi/tst2013.en',
'valid_trg_data':'/workspace/khoai23/opennmt/data/iwslt_en_vi/tst2013.vi',
'src_lang':'en', # useless atm
'trg_lang':'en',#'vi_spacy_model', # useless atm
'max_strlen':160,
'batchsize':1500,
'device':'cuda',
'd_model': 512,
'n_layers': 6,
'heads': 8,
'dropout': 0.1,
'lr':0.0001,
'epochs':30,
'printevery': 200,
'k':5,
}