import math

import torch
from torch import nn


class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding for non-recurrent neural networks.

    Implementation based on "Attention Is All You Need":

        PE(pos, 2i)     = sin(pos / 10000^(2i / channels))
        PE(pos, 2i + 1) = cos(pos / 10000^(2i / channels))

    The table is precomputed once for ``max_len`` positions and stored in the
    ``pe`` buffer with shape ``[1, channels, max_len]``.

    Args:
        channels (int): embedding size. Must be even.
        dropout_p (float): dropout rate applied to the output. Dropout is only
            created when this is greater than zero.
        max_len (int): maximum sequence length.
        use_scale (bool): whether to use a learnable scaling coefficient
            (a single scalar, initialised to 1) on the positional encoding.

    Raises:
        ValueError: if ``channels`` is odd.
    """

    def __init__(self, channels, dropout_p=0.0, max_len=5000, use_scale=False):
        super().__init__()
        if channels % 2 != 0:
            raise ValueError(
                "Cannot use sin/cos positional encoding with "
                f"odd channels (got channels={channels:d})"
            )
        self.channels = channels
        self.max_len = max_len
        self.use_scale = use_scale
        if use_scale:
            self.scale = torch.nn.Parameter(torch.ones(1))

        position = torch.arange(0, max_len).unsqueeze(1).float()  # [max_len, 1]
        # Per-pair angular frequency 1 / 10000^(2i / channels). The position
        # must be *divided* by 10000^(2i / channels); multiplying by it makes
        # the frequencies grow with the channel index and yields sin/cos of
        # huge arguments that are numerically meaningless in float32.
        div_term = torch.pow(10000, torch.arange(0, channels, 2).float() / channels)
        angles = position * (1.0 / div_term)  # [max_len, channels // 2]

        pe = torch.zeros(max_len, channels)
        pe[:, 0::2] = torch.sin(angles)  # even channels
        pe[:, 1::2] = torch.cos(angles)  # odd channels
        # [max_len, channels] -> [1, channels, max_len] to broadcast over the
        # batch dimension of a [B, C, T] input.
        pe = pe.unsqueeze(0).transpose(1, 2)
        self.register_buffer("pe", pe)

        # The attribute is only defined when dropout is enabled; ``forward``
        # checks for its presence.
        if dropout_p > 0:
            self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, x, mask=None, first_idx=None, last_idx=None):
        """Scale ``x`` by ``sqrt(channels)`` and add the positional encoding.

        When ``first_idx`` is ``None`` the encoding of positions
        ``0 .. T - 1`` is used and, if ``mask`` is given, multiplied by it.
        Otherwise the encoding slice ``first_idx:last_idx`` is used as is;
        ``mask`` is ignored and no length check is performed in that case.

        Shapes:
            x: [B, C, T]
            mask: [B, 1, T]
            first_idx: int
            last_idx: int

        Returns:
            Tensor with the same shape as ``x``.

        Raises:
            RuntimeError: if ``first_idx`` is ``None`` and ``T`` exceeds the
                number of precomputed positions.
        """
        x = x * math.sqrt(self.channels)

        if first_idx is None:
            if self.pe.size(2) < x.size(2):
                raise RuntimeError(
                    f"Sequence is {x.size(2)} but PositionalEncoding is"
                    f" limited to {self.pe.size(2)}. See max_len argument."
                )
            pos_enc = self.pe[:, :, : x.size(2)]
            if mask is not None:
                pos_enc = pos_enc * mask
        else:
            pos_enc = self.pe[:, :, first_idx:last_idx]

        if self.use_scale:
            pos_enc = self.scale * pos_enc
        x = x + pos_enc

        if hasattr(self, "dropout"):
            x = self.dropout(x)
        return x