RITA_xl / rita_modeling.py

Create rita_modeling.py

78dcfea over 2 years ago

8.89 kB

	import math
	import os
	from dataclasses import dataclass
	from typing import Optional, Tuple, Union

	import torch
	import torch.utils.checkpoint
	from torch import nn
	from torch.nn import CrossEntropyLoss

	from transformers.modeling_outputs import (
	BaseModelOutputWithPast,
	BaseModelOutputWithPastAndCrossAttentions,
	CausalLMOutputWithCrossAttentions,
	CausalLMOutputWithPast,
	)

	from transformers.modeling_utils import PreTrainedModel
	from transformers.utils import logging

	from .rita_configuration import RITAConfig
	import torch.nn.functional as F
	logger = logging.get_logger(__name__)

	@torch.jit.script
	def RITA_gelu(hidden_states):
	    """Apply the tanh-based GELU approximation element-wise.

	    Evaluates ``0.5 * x * (1 + tanh(0.79788456 * x * (1 + 0.044715 * x * x)))``
	    on the input tensor and returns a tensor of the same shape.
	    """
	    # Polynomial correction term: 1 + 0.044715 * x^2.
	    poly_term = 1 + 0.044715 * hidden_states * hidden_states
	    # Argument of the tanh: 0.79788456 * x * (1 + 0.044715 * x^2).
	    tanh_input = 0.79788456 * hidden_states * poly_term
	    return hidden_states * 0.5 * (1.0 + torch.tanh(tanh_input))

	class RITAGELU(nn.Module):
	    """Parameter-free ``nn.Module`` wrapper around :func:`RITA_gelu`.

	    Lets the activation be used as a layer, e.g. inside ``nn.Sequential``.
	    """

	    def forward(self, hidden_states):
	        """Return ``RITA_gelu(hidden_states)``."""
	        activated = RITA_gelu(hidden_states)
	        return activated

	def rotate_half(x):
	    """Swap the two halves of the last dimension, negating the second half.

	    The last dimension is split at ``x.shape[-1] // 2`` into ``(front, back)``
	    and the result is ``concat(-back, front)`` along that same dimension.
	    """
	    split_at = x.shape[-1] // 2
	    front = x[..., :split_at]
	    back = x[..., split_at:]
	    return torch.cat((-back, front), dim=-1)

	class RotaryEmbedding(nn.Module):
	    """Rotary position embedding: builds cos/sin tables and applies them to q/k.

	    Parameters
	    ----------
	    config:
	        configuration object; ``d_model``, ``num_heads`` and ``max_seq_len``
	        are read from it. ``d_model`` must be divisible by ``num_heads``.
	    """

	    def __init__(self, config):
	        super().__init__()
	        assert config.d_model % config.num_heads == 0

	        self.d_model = config.d_model
	        self.num_heads = config.num_heads
	        self.max_seq_len = config.max_seq_len

	        # One inverse frequency per pair of channels inside a head.
	        head_dim = self.d_model // self.num_heads
	        inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim))
	        self.register_buffer('inv_freq', inv_freq)
	        # Lazily-built cos/sin tables for the most recently seen sequence length.
	        self.seq_len_cached = None
	        self.cos_cached = None
	        self.sin_cached = None

	    def forward(
	        self, x: torch.FloatTensor, seq_dim=1
	    ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
	        """Return ``(cos, sin)`` tables for the sequence length of ``x``.

	        Parameters
	        ----------
	        x:
	            tensor whose size along ``seq_dim`` gives the sequence length; only
	            its shape and device are used.
	        seq_dim:
	            index of the sequence dimension in ``x``.

	        Returns
	        -------
	        Two tensors of shape ``(1, 1, seq_len, head_dim)`` located on
	        ``x.device``.
	        """
	        seq_len = x.shape[seq_dim]
	        # The tables are rebuilt when the length changes, and also when the
	        # input lives on another device than the cached tables (e.g. the model
	        # was moved after a first forward pass); keying on length alone would
	        # hand back tensors on the wrong device.
	        cache_is_stale = (
	            seq_len != self.seq_len_cached
	            or self.cos_cached is None
	            or self.cos_cached.device != x.device
	        )
	        if cache_is_stale:
	            self.seq_len_cached = seq_len
	            t = torch.arange(seq_len, device=x.device).type_as(self.inv_freq)
	            # Outer product: one row per position, one column per frequency.
	            freqs = torch.einsum("i,j->ij", t, self.inv_freq)
	            # Duplicate the frequencies so the table spans the whole head_dim.
	            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
	            self.cos_cached = emb.cos()[None, None, :, :]
	            self.sin_cached = emb.sin()[None, None, :, :]
	        return self.cos_cached, self.sin_cached

	    def apply_rotary_pos_emb(self, q, k, cos, sin):
	        """Rotate ``q`` and ``k`` with the given ``cos``/``sin`` tables.

	        Returns the pair ``(q_rotated, k_rotated)``.
	        """
	        return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)


	class SelfAttention(nn.Module):
	    """Implementation of MultiHeadAttention following `Karpathy's MinGPT <https://github.com/karpathy/minGPT>`_.
	    modified to use rotary embeddings.

	    Parameters
	    ----------
	    d_model: int,
	        total dimension of the model; must be divisible by ``num_heads``.
	    num_heads: int,
	        number of parallel attention heads.
	    num_layers: int,
	        number of layers in the model. Stored on the module; not otherwise
	        used by this class.
	    rotary_embedding: Optional[nn.Module], default None,
	        a RotaryEmbedding module used to add positional information to the
	        queries and keys. When None, no positional information is added.
	    dropout: float, default 0.1,
	        amount of dropout on the attention weights and on the output projection.
	    sigma: float, default 0.02,
	        standard deviation intended for the init. Stored on the module; not
	        otherwise used by this class.
	    use_cache: bool, default False,
	        stored on the module; not otherwise used by this class.
	    bias: bool, default True,
	        whether the key/query/value/output projections have a bias term.
	    """

	    def __init__(
	        self,
	        d_model: int,
	        num_heads: int,
	        num_layers: int,
	        rotary_embedding=None,
	        dropout: float = 0.1,
	        sigma=0.02,
	        use_cache: bool = False,
	        bias=True,
	    ):
	        super().__init__()
	        assert d_model % num_heads == 0
	        self.d_model = d_model
	        self.num_heads = num_heads
	        self.head_dim = self.d_model // self.num_heads
	        self.num_layers = num_layers
	        self.dropout = dropout
	        self.sigma = sigma
	        self.bias = bias

	        # key, query, value projections for all heads
	        self.key = nn.Linear(d_model, d_model, bias=bias)
	        self.query = nn.Linear(d_model, d_model, bias=bias)
	        self.value = nn.Linear(d_model, d_model, bias=bias)
	        # regularization
	        self.attn_drop = nn.Dropout(dropout)
	        self.resid_drop = nn.Dropout(dropout)
	        # output projection
	        self.proj = nn.Linear(d_model, d_model, bias=bias)

	        self.rotary_embedding = rotary_embedding
	        self.layer_id = None  # will be set by the Transformer itself
	        self.use_cache = use_cache
	        self.qkv = None

	    def _split_heads(self, projected):
	        """Reshape ``(N, L, D)`` into per-head layout ``(N, nh, L, hs)``."""
	        N, L, D = projected.size()
	        return projected.view(N, L, self.num_heads, D // self.num_heads).transpose(1, 2)

	    def forward(
	        self,
	        x,
	        attn_mask: Optional[torch.BoolTensor] = None,
	        padding_mask: Optional[torch.BoolTensor] = None,
	    ) -> torch.FloatTensor:
	        """Run multi-head self-attention over ``x``.

	        Parameters
	        ----------
	        x:
	            input of shape ``(N, L, D)`` (batch, context, d_model).
	        attn_mask:
	            optional boolean mask with ``L * L`` elements; positions that are
	            True are set to ``-inf`` before the softmax (i.e. not attended).
	        padding_mask:
	            optional boolean mask with ``N * L`` elements; key positions that
	            are True are set to ``-inf`` for every query of that batch item.

	        Returns
	        -------
	        Tensor of shape ``(N, L, D)``.
	        """
	        N, L, D = x.size()  # Batch_size, Context_size, d_model

	        # calculate query, key, values for all heads in batch and move head
	        # forward to be the batch dim: each is (N, nh, L, hs)
	        k = self._split_heads(self.key(x))
	        q = self._split_heads(self.query(x))
	        v = self._split_heads(self.value(x))

	        if self.rotary_embedding is not None:
	            cos, sin = self.rotary_embedding(x)
	            q, k = self.rotary_embedding.apply_rotary_pos_emb(q, k, cos, sin)

	        # Self-attend: (N, nh, L, hs) x (N, nh, hs, L) -> (N, nh, L, L)
	        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))

	        if attn_mask is not None:
	            # Same (L, L) mask for every batch item and head.
	            att.masked_fill_(attn_mask.view(1, 1, L, L), float("-inf"))

	        if padding_mask is not None:
	            # Broadcast the per-item key mask over heads and query positions.
	            att = att.masked_fill(padding_mask.view(N, 1, 1, L), float("-inf"))

	        att = F.softmax(att, dim=-1)
	        att = self.attn_drop(att)
	        y = att @ v  # (N, nh, L, L) x (N, nh, L, hs) -> (N, nh, L, hs)
	        # re-assemble all head outputs side by side
	        y = y.transpose(1, 2).contiguous().view(N, L, D)

	        # output projection
	        y = self.resid_drop(self.proj(y))
	        return y

	class DecoderLayer(nn.Module):
	    """Transformer block containing the self-attention module and the feedforward module.

	    Both sub-blocks are pre-norm residual branches: the input is layer-normed,
	    transformed, passed through dropout and added back to the input.

	    Parameters
	    ----------
	    config:
	        configuration object; ``d_model``, ``num_heads``, ``num_layers``,
	        ``dropout`` and ``d_feedforward`` are read from it (plus whatever
	        ``RotaryEmbedding`` reads).
	    """

	    def __init__(
	        self, config
	    ):
	        super().__init__()
	        # Keyword arguments on purpose: the third positional parameter of
	        # SelfAttention is ``num_layers``, so passing ``config.dropout``
	        # positionally would leave the attention dropout at its default value.
	        self.self_attention = SelfAttention(
	            d_model=config.d_model,
	            num_heads=config.num_heads,
	            num_layers=config.num_layers,
	            rotary_embedding=RotaryEmbedding(config),
	            dropout=config.dropout,
	        )
	        self.attn_norm = nn.LayerNorm(config.d_model)
	        self.attn_dropout = nn.Dropout(config.dropout)

	        self.mlp = nn.Sequential(
	            nn.Linear(config.d_model, config.d_feedforward, bias=True),
	            RITAGELU(),
	            nn.Linear(config.d_feedforward, config.d_model, bias=True),
	        )
	        self.mlp_norm = nn.LayerNorm(config.d_model)
	        self.mlp_dropout = nn.Dropout(config.dropout)

	    def forward(
	        self,
	        x: torch.FloatTensor,
	        attn_mask: torch.BoolTensor,
	        padding_mask: Optional[torch.BoolTensor] = None,
	    ) -> torch.FloatTensor:
	        """Apply the attention branch then the MLP branch to ``x``.

	        ``attn_mask`` and ``padding_mask`` are forwarded unchanged to the
	        self-attention module. The result has the same shape as ``x``.
	        """
	        # Attention branch (pre-norm + residual).
	        y = self.attn_norm(x)
	        y = self.self_attention(y, attn_mask=attn_mask, padding_mask=padding_mask)
	        x = x + self.attn_dropout(y)

	        # Feed-forward branch (pre-norm + residual).
	        y = self.mlp_norm(x)
	        y = self.mlp(y)
	        x = x + self.mlp_dropout(y)
	        return x

	class RITAModel(PreTrainedModel):
	    """Decoder-only transformer: embedding, a stack of DecoderLayer, final norm and vocabulary projection."""

	    config_class = RITAConfig

	    def __init__(
	        self,
	        config
	    ):
	        super().__init__(config)
	        self.embedding = nn.Embedding(config.vocab_size, config.d_model)
	        self.layers = nn.ModuleList([DecoderLayer(config) for _ in range(config.num_layers)])
	        self.final_norm = nn.LayerNorm(config.d_model)
	        self.projector = nn.Linear(config.d_model, config.vocab_size, bias=False)

	    def forward(self, input_ids, attn_mask=None, padding_mask=None, return_hidden=False) -> torch.FloatTensor:
	        """Run the model on a batch of token ids.

	        Parameters
	        ----------
	        input_ids:
	            token ids of shape ``(N, L)``.
	        attn_mask:
	            optional boolean ``(L, L)`` mask where True marks positions that
	            must not be attended. When None, a causal mask (True strictly
	            above the diagonal) is built.
	        padding_mask:
	            optional mask forwarded unchanged to every layer.
	        return_hidden:
	            when True, return the normalized hidden states ``(N, L, D)``
	            instead of the vocabulary projection.
	        """
	        x = self.embedding(input_ids)  # N x L x D
	        if attn_mask is None:
	            seq_len = input_ids.size(1)
	            # Built directly on the input's device; True strictly above the
	            # diagonal, i.e. each position cannot attend to later ones.
	            attn_mask = torch.tril(torch.ones(seq_len, seq_len, device=input_ids.device)) == 0
	        for layer in self.layers:
	            x = layer(x, attn_mask=attn_mask, padding_mask=padding_mask)
	        x = self.final_norm(x)  # N x L x D

	        if return_hidden:
	            return x
	        return self.projector(x)

	    # Some common HF functions.
	    def get_input_embeddings(self):
	        """Return the token embedding module."""
	        return self.embedding

	    def set_input_embeddings(self, new_embeddings):
	        """Replace the token embedding module."""
	        self.embedding = new_embeddings

	    def get_output_embeddings(self):
	        """Return the output (vocabulary) projection module."""
	        return self.projector

	    def set_output_embeddings(self, new_projector):
	        """Replace the output (vocabulary) projection module."""
	        self.projector = new_projector