michael-guenther committed
Commit 77af1c7
1 Parent(s): 1c61b96

add stochastic_depth

Files changed (3):
  1. block.py +26 -14
  2. modeling_xlm_roberta.py +121 -61
  3. stochastic_depth.py +97 -0
block.py CHANGED

@@ -10,8 +10,8 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch import Tensor
-from torchvision.ops import StochasticDepth
 
+from .stochastic_depth import StochasticDepth
 from .mha import MHA
 from .mlp import Mlp
 
@@ -106,7 +106,9 @@ class Block(nn.Module):
                 p._shared_params = True
 
     def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
-        return self.mixer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)
+        return self.mixer.allocate_inference_cache(
+            batch_size, max_seqlen, dtype=dtype, **kwargs
+        )
 
     def forward(
         self,
@@ -152,7 +154,7 @@ class Block(nn.Module):
                     rowscale=rowscale1,
                     prenorm=True,
                     residual_in_fp32=self.residual_in_fp32,
-                    is_rms_norm=isinstance(self.norm1, RMSNorm)
+                    is_rms_norm=isinstance(self.norm1, RMSNorm),
                 )
             if mixer_kwargs is None:
                 mixer_kwargs = {}
@@ -165,7 +167,9 @@ class Block(nn.Module):
                 if not self.fused_dropout_add_ln:
                     dropped = self.drop_path2(self.dropout2(hidden_states))
                     residual = (dropped + residual) if residual is not None else dropped
-                    hidden_states = self.norm2(residual.to(dtype=self.norm2.weight.dtype))
+                    hidden_states = self.norm2(
+                        residual.to(dtype=self.norm2.weight.dtype)
+                    )
                     if self.residual_in_fp32:
                         residual = residual.to(torch.float32)
                 else:
@@ -189,7 +193,7 @@ class Block(nn.Module):
                         rowscale=rowscale2,
                         prenorm=True,
                         residual_in_fp32=self.residual_in_fp32,
-                        is_rms_norm=isinstance(self.norm2, RMSNorm)
+                        is_rms_norm=isinstance(self.norm2, RMSNorm),
                     )
                 hidden_states = self.mlp(hidden_states)
             return hidden_states, residual
@@ -212,7 +216,9 @@ class Block(nn.Module):
                 else:
                     rowscale1 = self.drop_path1(
                         torch.ones(
-                            mixer_out.shape[:-1], device=mixer_out.device, dtype=mixer_out.dtype
+                            mixer_out.shape[:-1],
+                            device=mixer_out.device,
+                            dtype=mixer_out.dtype,
                         )
                     )
                 hidden_states = layer_norm_fn(
@@ -224,7 +230,7 @@ class Block(nn.Module):
                     dropout_p=self.dropout1.p if self.training else 0.0,
                     rowscale=rowscale1,
                     prenorm=False,
-                    is_rms_norm=isinstance(self.norm1, RMSNorm)
+                    is_rms_norm=isinstance(self.norm1, RMSNorm),
                 )
             if not isinstance(self.mlp, nn.Identity):
                 mlp_out = self.mlp(hidden_states)
@@ -242,7 +248,9 @@ class Block(nn.Module):
                     else:
                         rowscale2 = self.drop_path2(
                             torch.ones(
-                                mlp_out.shape[:-1], device=mlp_out.device, dtype=mlp_out.dtype
+                                mlp_out.shape[:-1],
+                                device=mlp_out.device,
+                                dtype=mlp_out.dtype,
                             )
                         )
                     hidden_states = layer_norm_fn(
@@ -254,7 +262,7 @@ class Block(nn.Module):
                         dropout_p=self.dropout2.p if self.training else 0.0,
                         rowscale=rowscale2,
                         prenorm=False,
-                        is_rms_norm=isinstance(self.norm2, RMSNorm)
+                        is_rms_norm=isinstance(self.norm2, RMSNorm),
                     )
             return hidden_states
 
@@ -333,7 +341,9 @@ class ParallelBlock(nn.Module):
                 p._shared_params = True
 
     def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
-        return self.mixer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)
+        return self.mixer.allocate_inference_cache(
+            batch_size, max_seqlen, dtype=dtype, **kwargs
+        )
 
     def forward(
         self,
@@ -373,7 +383,9 @@ class ParallelBlock(nn.Module):
                 residual = residual.to(torch.float32)
         else:
             weight2, bias2 = (
-                (self.norm2.weight, self.norm2.bias) if not self.tied_norm else (None, None)
+                (self.norm2.weight, self.norm2.bias)
+                if not self.tied_norm
+                else (None, None)
             )
             hidden_states1, *rest, residual = layer_norm_fn(
                 hidden_states1,
@@ -387,14 +399,14 @@ class ParallelBlock(nn.Module):
                 dropout_p=self.dropout1.p if self.training else 0.0,
                 prenorm=True,
                 residual_in_fp32=self.residual_in_fp32,
-                is_rms_norm=isinstance(self.norm1, RMSNorm)
+                is_rms_norm=isinstance(self.norm1, RMSNorm),
             )
         if self.tied_norm:
             hidden_states2 = hidden_states1
         else:
-            hidden_states2, = rest
+            (hidden_states2,) = rest
         if mixer_kwargs is None:
             mixer_kwargs = {}
         hidden_states1 = self.mixer(hidden_states1, **mixer_kwargs)
         hidden_states2 = self.mlp(hidden_states2)
-        return hidden_states1, hidden_states2, residual
+        return hidden_states1, hidden_states2, residual
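A note on the drop-path hunks above: in the fused path, Block never calls drop_path on the branch output directly. It passes a tensor of ones through the StochasticDepth module and hands the result to layer_norm_fn as rowscale, so each row is multiplied by 0 or 1 / (1 - p). The following minimal sketch is not part of the commit; it assumes the stochastic_depth.py file added below is importable as stochastic_depth, and only illustrates why the rowscale trick is equivalent to applying drop path to the branch itself.

import torch

from stochastic_depth import StochasticDepth  # module added in this commit

torch.manual_seed(0)
drop_path = StochasticDepth(p=0.25, mode="row")
drop_path.train()

branch_out = torch.randn(4, 3, 8)                        # (batch, seqlen, hidden)
rowscale = drop_path(torch.ones(branch_out.shape[:-1]))  # per row: 0 or 1 / (1 - p)

# Multiplying the branch output by rowscale (what layer_norm_fn does with the
# rowscale argument) has the same effect as applying drop path to the branch.
scaled_branch = branch_out * rowscale.unsqueeze(-1)
print(rowscale)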
modeling_xlm_roberta.py CHANGED

@@ -42,6 +42,7 @@ from .block import Block
 from .embedding import XLMRobertaEmbeddings
 from .mha import MHA
 from .mlp import FusedMLP, Mlp
+from .stochastic_depth import StochasticDepth
 
 
 try:
@@ -69,10 +70,16 @@ def create_mixer_cls(config, cross_attn=False, return_residual=False):
     fused_bias_fc = getattr(config, "fused_bias_fc", False)
     rotary_kwargs = {}
     if config.position_embedding_type == "rotary":
-        rotary_kwargs["rotary_emb_dim"] = getattr(config, "rotary_emb_dim", config.hidden_size)
+        rotary_kwargs["rotary_emb_dim"] = getattr(
+            config, "rotary_emb_dim", config.hidden_size
+        )
         rotary_kwargs["rotary_emb_base"] = getattr(config, "rotary_emb_base", 10000.0)
-        rotary_kwargs["rotary_emb_scale_base"] = getattr(config, "rotary_emb_scale_base", None)
-        rotary_kwargs["rotary_emb_interleaved"] = getattr(config, "rotary_emb_interleaved", False)
+        rotary_kwargs["rotary_emb_scale_base"] = getattr(
+            config, "rotary_emb_scale_base", None
+        )
+        rotary_kwargs["rotary_emb_interleaved"] = getattr(
+            config, "rotary_emb_interleaved", False
+        )
     mixer_cls = partial(
         MHA,
         num_heads=config.num_attention_heads,
@@ -183,7 +190,9 @@ class XLMRobertaEncoder(nn.Module):
         """
         if key_padding_mask is None or not self.use_flash_attn:
             mixer_kwargs = (
-                {"key_padding_mask": key_padding_mask.bool()} if key_padding_mask is not None else None
+                {"key_padding_mask": key_padding_mask.bool()}
+                if key_padding_mask is not None
+                else None
             )
             for layer in self.layers:
                 if self._grad_checkpointing:
@@ -191,7 +200,7 @@ class XLMRobertaEncoder(nn.Module):
                         layer,
                         hidden_states,
                         use_reentrant=False,
-                        mixer_kwargs=mixer_kwargs
+                        mixer_kwargs=mixer_kwargs,
                     )
                 else:
                     hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
@@ -210,7 +219,7 @@ class XLMRobertaEncoder(nn.Module):
                             layer,
                             hidden_states,
                             use_reentrant=False,
-                            mixer_kwargs=mixer_kwargs
+                            mixer_kwargs=mixer_kwargs,
                         )
                     else:
                         hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
@@ -222,7 +231,7 @@ class XLMRobertaEncoder(nn.Module):
                             layer,
                             hidden_states,
                             use_reentrant=False,
-                            mixer_kwargs=mixer_kwargs
+                            mixer_kwargs=mixer_kwargs,
                         )
                     else:
                         hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
@@ -230,15 +239,19 @@ class XLMRobertaEncoder(nn.Module):
                     subset_idx = torch.nonzero(
                         subset_mask[key_padding_mask], as_tuple=False
                     ).flatten()
-                    subset_seqlens = (subset_mask & key_padding_mask).sum(dim=-1, dtype=torch.int32)
+                    subset_seqlens = (subset_mask & key_padding_mask).sum(
+                        dim=-1, dtype=torch.int32
+                    )
                     subset_cu_seqlens = F.pad(
-                        torch.cumsum(subset_seqlens, dim=0, dtype=torch.torch.int32), (1, 0)
+                        torch.cumsum(subset_seqlens, dim=0, dtype=torch.torch.int32),
+                        (1, 0),
                     )
                 else:
                     subset_idx = torch.nonzero(subset_mask, as_tuple=False).flatten()
                     subset_seqlens = subset_mask.sum(dim=-1, dtype=torch.int32)
                     subset_cu_seqlens = F.pad(
-                        torch.cumsum(subset_seqlens, dim=0, dtype=torch.torch.int32), (1, 0)
+                        torch.cumsum(subset_seqlens, dim=0, dtype=torch.torch.int32),
+                        (1, 0),
                     )
                 hidden_states_subset, hidden_states = index_first_axis_residual(
                     hidden_states, subset_idx
@@ -256,10 +269,12 @@ class XLMRobertaEncoder(nn.Module):
                         self.layers[-1],
                         hidden_states_subset,
                         use_reentrant=False,
-                        mixer_kwargs=mixer_kwargs
+                        mixer_kwargs=mixer_kwargs,
                     )
                 else:
-                    hidden_states = self.layers[-1](hidden_states_subset, mixer_kwargs=mixer_kwargs)
+                    hidden_states = self.layers[-1](
+                        hidden_states_subset, mixer_kwargs=mixer_kwargs
+                    )
         return hidden_states
 
 
@@ -308,7 +323,10 @@ class XLMRobertaPredictionHeadTransform(nn.Module):
             hidden_states = self.layer_norm(hidden_states)
         else:
             hidden_states = layer_norm_fn(
-                hidden_states, self.layer_norm.weight, self.layer_norm.bias, eps=self.layer_norm.eps
+                hidden_states,
+                self.layer_norm.weight,
+                self.layer_norm.bias,
+                eps=self.layer_norm.eps,
             )
         return hidden_states
 
@@ -349,6 +367,7 @@ class XLMRobertaPreTrainedModel(PreTrainedModel):
     """An abstract class to handle weights initialization and
     a simple interface for dowloading and loading pretrained models.
     """
+
     config_class = XLMRobertaFlashConfig
     base_model_prefix = "roberta"
     supports_gradient_checkpointing = True
@@ -358,7 +377,6 @@ class XLMRobertaPreTrainedModel(PreTrainedModel):
             module.gradient_checkpointing = value
 
 
-
 class XLMRobertaModel(XLMRobertaPreTrainedModel):
     def __init__(self, config: XLMRobertaFlashConfig, add_pooling_layer=True):
         super().__init__(config)
@@ -370,7 +388,12 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
         self.fused_dropout_add_ln = getattr(config, "fused_dropout_add_ln", False)
         if self.fused_dropout_add_ln and layer_norm_fn is None:
             raise ImportError("Triton is not installed")
-        assert config.hidden_act in ["gelu", "gelu_new", "gelu_fast", "gelu_pytorch_tanh"]
+        assert config.hidden_act in [
+            "gelu",
+            "gelu_new",
+            "gelu_fast",
+            "gelu_pytorch_tanh",
+        ]
 
         self.embeddings = XLMRobertaEmbeddings(
             config.hidden_size,
@@ -386,7 +409,6 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
 
         self.apply(partial(_init_weights, initializer_range=config.initializer_range))
 
-
     def forward(
         self,
         input_ids,
@@ -406,9 +428,14 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
         if kwargs:
             for key, value in kwargs.items():
                 if value is not None:
-                    logger.warning('Flash attention implementation does not support kwargs: %s', key)
+                    logger.warning(
+                        'Flash attention implementation does not support kwargs: %s',
+                        key,
+                    )
 
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
 
         hidden_states = self.embeddings(
             input_ids, position_ids=position_ids, token_type_ids=token_type_ids
@@ -439,17 +466,23 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
         )
 
         if masked_tokens_mask is None:
-            pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+            pooled_output = (
+                self.pooler(sequence_output) if self.pooler is not None else None
+            )
         else:
             # TD [2022-03-01]: the indexing here is very tricky.
             if attention_mask is not None:
                 subset_idx = subset_mask[attention_mask]
                 pool_input = sequence_output[first_col_mask[attention_mask][subset_idx]]
-                sequence_output = sequence_output[masked_tokens_mask[attention_mask][subset_idx]]
+                sequence_output = sequence_output[
+                    masked_tokens_mask[attention_mask][subset_idx]
+                ]
             else:
                 pool_input = sequence_output[first_col_mask[subset_mask]]
                 sequence_output = sequence_output[masked_tokens_mask[subset_mask]]
-            pooled_output = self.pooler(pool_input, pool=False) if self.pooler is not None else None
+            pooled_output = (
+                self.pooler(pool_input, pool=False) if self.pooler is not None else None
+            )
 
         if not return_dict:
             return sequence_output, pooled_output
@@ -487,7 +520,6 @@ class XLMRobertaForMaskedLM(XLMRobertaPreTrainedModel):
     def set_output_embeddings(self, new_embeddings):
         self.lm_head.decoder = new_embeddings
 
-
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
@@ -511,7 +543,9 @@ class XLMRobertaForMaskedLM(XLMRobertaPreTrainedModel):
         kwargs (`Dict[str, any]`, optional, defaults to *{}*):
             Used to hide legacy arguments that have been deprecated.
         """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
 
         outputs = self.roberta(
             input_ids,
@@ -534,11 +568,15 @@ class XLMRobertaForMaskedLM(XLMRobertaPreTrainedModel):
             # move labels to correct device to enable model parallelism
             labels = labels.to(prediction_scores.device)
             loss_fct = CrossEntropyLoss()
-            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+            masked_lm_loss = loss_fct(
+                prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)
+            )
 
         if not return_dict:
             output = (prediction_scores,) + outputs[2:]
-            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+            return (
+                ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+            )
 
         return MaskedLMOutput(
             loss=masked_lm_loss,
@@ -656,7 +694,9 @@ def remap_state_dict(state_dict, config: PretrainedConfig):
         key = re.sub(r"LayerNorm.beta$", "LayerNorm.bias", key)
         return key
 
-    state_dict = OrderedDict((key_mapping_ln_gamma_beta(k), v) for k, v in state_dict.items())
+    state_dict = OrderedDict(
+        (key_mapping_ln_gamma_beta(k), v) for k, v in state_dict.items()
+    )
 
     # Layers
     def key_mapping_layers(key):
@@ -715,12 +755,18 @@ def remap_state_dict(state_dict, config: PretrainedConfig):
            state_dict[f"bert.encoder.layers.{d}.mixer.Wqkv.weight"] = torch.cat(
                [Wq, Wk, Wv], dim=0
            )
-            state_dict[f"bert.encoder.layers.{d}.mixer.Wqkv.bias"] = torch.cat([bq, bk, bv], dim=0)
+            state_dict[f"bert.encoder.layers.{d}.mixer.Wqkv.bias"] = torch.cat(
+                [bq, bk, bv], dim=0
+            )
        else:
            state_dict[f"bert.encoder.layers.{d}.mixer.Wq.weight"] = Wq
-            state_dict[f"bert.encoder.layers.{d}.mixer.Wkv.weight"] = torch.cat([Wk, Wv], dim=0)
+            state_dict[f"bert.encoder.layers.{d}.mixer.Wkv.weight"] = torch.cat(
+                [Wk, Wv], dim=0
+            )
            state_dict[f"bert.encoder.layers.{d}.mixer.Wq.bias"] = bq
-            state_dict[f"bert.encoder.layers.{d}.mixer.Wkv.bias"] = torch.cat([bk, bv], dim=0)
+            state_dict[f"bert.encoder.layers.{d}.mixer.Wkv.bias"] = torch.cat(
+                [bk, bv], dim=0
+            )
 
     def key_mapping_attn(key):
         return re.sub(
@@ -734,7 +780,9 @@ def remap_state_dict(state_dict, config: PretrainedConfig):
     def key_mapping_decoder_bias(key):
         return re.sub(r"^cls.predictions.bias", "cls.predictions.decoder.bias", key)
 
-    state_dict = OrderedDict((key_mapping_decoder_bias(k), v) for k, v in state_dict.items())
+    state_dict = OrderedDict(
+        (key_mapping_decoder_bias(k), v) for k, v in state_dict.items()
+    )
 
     # Word embedding
     pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
@@ -774,51 +822,59 @@ def inv_remap_state_dict(state_dict, config: PretrainedConfig):
     state_dict["bert.embeddings.word_embeddings.weight"] = word_embeddings[
         : config.orig_vocab_size, :
     ]
-    state_dict["cls.predictions.decoder.weight"] = decoder_weight[: config.orig_vocab_size, :]
-    state_dict["cls.predictions.decoder.bias"] = decoder_bias[: config.orig_vocab_size]
+    state_dict["cls.predictions.decoder.weight"] = decoder_weight[
+        : config.orig_vocab_size, :
+    ]
+    state_dict["cls.predictions.decoder.bias"] = decoder_bias[
+        : config.orig_vocab_size
+    ]
 
     for d in range(config.num_hidden_layers):
         last_layer_subset = getattr(config, "last_layer_subset", False)
         if not last_layer_subset or d != (config.num_hidden_layers - 1):
             Wqkv_weights = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wqkv.weight")
             Wqkv_biases = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wqkv.bias")
-            state_dict[f"bert.encoder.layers.{d}.attention.self.query.weight"] = Wqkv_weights[
-                : Wqkv_weights.shape[0] // 3, :
-            ]
-            state_dict[f"bert.encoder.layers.{d}.attention.self.key.weight"] = Wqkv_weights[
+            state_dict[
+                f"bert.encoder.layers.{d}.attention.self.query.weight"
+            ] = Wqkv_weights[: Wqkv_weights.shape[0] // 3, :]
+            state_dict[
+                f"bert.encoder.layers.{d}.attention.self.key.weight"
+            ] = Wqkv_weights[
                 Wqkv_weights.shape[0] // 3 : 2 * Wqkv_weights.shape[0] // 3, :
             ]
-            state_dict[f"bert.encoder.layers.{d}.attention.self.value.weight"] = Wqkv_weights[
-                2 * Wqkv_weights.shape[0] // 3 :, :
-            ]
-            state_dict[f"bert.encoder.layers.{d}.attention.self.query.bias"] = Wqkv_biases[
-                : Wqkv_biases.shape[0] // 3
-            ]
-            state_dict[f"bert.encoder.layers.{d}.attention.self.key.bias"] = Wqkv_biases[
-                Wqkv_biases.shape[0] // 3 : 2 * Wqkv_biases.shape[0] // 3
-            ]
-            state_dict[f"bert.encoder.layers.{d}.attention.self.value.bias"] = Wqkv_biases[
-                2 * Wqkv_biases.shape[0] // 3 :
-            ]
+            state_dict[
+                f"bert.encoder.layers.{d}.attention.self.value.weight"
+            ] = Wqkv_weights[2 * Wqkv_weights.shape[0] // 3 :, :]
+            state_dict[
+                f"bert.encoder.layers.{d}.attention.self.query.bias"
+            ] = Wqkv_biases[: Wqkv_biases.shape[0] // 3]
+            state_dict[
+                f"bert.encoder.layers.{d}.attention.self.key.bias"
+            ] = Wqkv_biases[Wqkv_biases.shape[0] // 3 : 2 * Wqkv_biases.shape[0] // 3]
+            state_dict[
+                f"bert.encoder.layers.{d}.attention.self.value.bias"
+            ] = Wqkv_biases[2 * Wqkv_biases.shape[0] // 3 :]
         else:
             Wq_weight = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wq.weight")
             Wkv_weights = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wkv.weight")
             Wq_bias = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wq.bias")
             Wkv_biases = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wkv.bias")
-            state_dict[f"bert.encoder.layers.{d}.attention.self.query.weight"] = Wq_weight
-            state_dict[f"bert.encoder.layers.{d}.attention.self.key.weight"] = Wkv_weights[
-                : Wkv_weights.shape[0] // 2, :
-            ]
-            state_dict[f"bert.encoder.layers.{d}.attention.self.value.weight"] = Wkv_weights[
-                Wkv_weights.shape[0] // 2 :, :
-            ]
+            state_dict[
+                f"bert.encoder.layers.{d}.attention.self.query.weight"
+            ] = Wq_weight
+            state_dict[
+                f"bert.encoder.layers.{d}.attention.self.key.weight"
+            ] = Wkv_weights[: Wkv_weights.shape[0] // 2, :]
+            state_dict[
+                f"bert.encoder.layers.{d}.attention.self.value.weight"
+            ] = Wkv_weights[Wkv_weights.shape[0] // 2 :, :]
             state_dict[f"bert.encoder.layers.{d}.attention.self.query.bias"] = Wq_bias
             state_dict[f"bert.encoder.layers.{d}.attention.self.key.bias"] = Wkv_biases[
                 : Wkv_biases.shape[0] // 2
             ]
-            state_dict[f"bert.encoder.layers.{d}.attention.self.value.bias"] = Wkv_biases[
-                Wkv_biases.shape[0] // 2 :
-            ]
+            state_dict[
+                f"bert.encoder.layers.{d}.attention.self.value.bias"
+            ] = Wkv_biases[Wkv_biases.shape[0] // 2 :]
 
     def inv_key_mapping_ln(key):
         key = re.sub(r"bert.emb_ln.", "bert.embeddings.LayerNorm.", key)
@@ -870,14 +926,18 @@ def inv_remap_state_dict(state_dict, config: PretrainedConfig):
     def inv_key_mapping_decoder_bias(key):
         return re.sub(r"cls.predictions.decoder.bias", "cls.predictions.bias", key)
 
-    state_dict = OrderedDict((inv_key_mapping_ln(key), value) for key, value in state_dict.items())
+    state_dict = OrderedDict(
+        (inv_key_mapping_ln(key), value) for key, value in state_dict.items()
+    )
     state_dict = OrderedDict(
         (inv_key_mapping_ln_gamma_beta(key), value) for key, value in state_dict.items()
     )
     state_dict = OrderedDict(
         (inv_key_mapping_layers(key), value) for key, value in state_dict.items()
     )
-    state_dict = OrderedDict((inv_key_mapping_mlp(key), value) for key, value in state_dict.items())
+    state_dict = OrderedDict(
+        (inv_key_mapping_mlp(key), value) for key, value in state_dict.items()
+    )
     state_dict = OrderedDict(
         (inv_key_mapping_attn(key), value) for key, value in state_dict.items()
     )
@@ -885,4 +945,4 @@ def inv_remap_state_dict(state_dict, config: PretrainedConfig):
         (inv_key_mapping_decoder_bias(key), value) for key, value in state_dict.items()
     )
 
-    return state_dict
+    return state_dict
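A small worked example for the subset_cu_seqlens reflow above (a sketch, not part of the commit): F.pad(torch.cumsum(...), (1, 0)) turns per-sequence token counts into cumulative offsets with a leading zero, the layout the flash-attention variable-length path expects.

import torch
import torch.nn.functional as F

subset_seqlens = torch.tensor([3, 1, 2], dtype=torch.int32)   # tokens kept per sequence
subset_cu_seqlens = F.pad(
    torch.cumsum(subset_seqlens, dim=0, dtype=torch.int32), (1, 0)
)
print(subset_cu_seqlens)  # tensor([0, 3, 4, 6], dtype=torch.int32)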
stochastic_depth.py ADDED

@@ -0,0 +1,97 @@
+# Implementation modified from torchvision:
+# https://github.com/pytorch/vision/blob/main/torchvision/ops/stochastic_depth.py
+#
+# License:
+# BSD 3-Clause License
+#
+# Copyright (c) Soumith Chintala 2016,
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import torch
+import torch.fx
+from torch import nn, Tensor
+
+
+def stochastic_depth(
+    input: Tensor, p: float, mode: str, training: bool = True
+) -> Tensor:
+    """
+    Implements the Stochastic Depth from `"Deep Networks with Stochastic Depth"
+    <https://arxiv.org/abs/1603.09382>`_ used for randomly dropping residual
+    branches of residual architectures.
+
+    Args:
+        input (Tensor[N, ...]): The input tensor or arbitrary dimensions with the first one
+            being its batch i.e. a batch with ``N`` rows.
+        p (float): probability of the input to be zeroed.
+        mode (str): ``"batch"`` or ``"row"``.
+            ``"batch"`` randomly zeroes the entire input, ``"row"`` zeroes
+            randomly selected rows from the batch.
+        training: apply stochastic depth if is ``True``. Default: ``True``
+
+    Returns:
+        Tensor[N, ...]: The randomly zeroed tensor.
+    """
+    if p < 0.0 or p > 1.0:
+        raise ValueError(f"drop probability has to be between 0 and 1, but got {p}")
+    if mode not in ["batch", "row"]:
+        raise ValueError(f"mode has to be either 'batch' or 'row', but got {mode}")
+    if not training or p == 0.0:
+        return input
+
+    survival_rate = 1.0 - p
+    if mode == "row":
+        size = [input.shape[0]] + [1] * (input.ndim - 1)
+    else:
+        size = [1] * input.ndim
+    noise = torch.empty(size, dtype=input.dtype, device=input.device)
+    noise = noise.bernoulli_(survival_rate)
+    if survival_rate > 0.0:
+        noise.div_(survival_rate)
+    return input * noise
+
+
+torch.fx.wrap("stochastic_depth")
+
+
+class StochasticDepth(nn.Module):
+    """
+    See :func:`stochastic_depth`.
+    """
+
+    def __init__(self, p: float, mode: str) -> None:
+        super().__init__()
+        self.p = p
+        self.mode = mode
+
+    def forward(self, input: Tensor) -> Tensor:
+        return stochastic_depth(input, self.p, self.mode, self.training)
+
+    def __repr__(self) -> str:
+        s = f"{self.__class__.__name__}(p={self.p}, mode={self.mode})"
+        return s
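A minimal usage sketch (not part of the commit, assuming stochastic_depth.py is importable): with mode="row", StochasticDepth implements per-sample drop path, the behavior block.py relies on for its drop_path modules. During training each sample's branch output is zeroed with probability p and the survivors are rescaled by 1 / (1 - p); in eval mode the module is the identity.

import torch

from stochastic_depth import StochasticDepth

drop_path = StochasticDepth(p=0.1, mode="row")

x = torch.randn(8, 16)        # residual stream, batch of 8
branch = torch.randn(8, 16)   # output of an attention/MLP branch

drop_path.train()
y_train = x + drop_path(branch)   # some rows of `branch` dropped, the rest upscaled

drop_path.eval()
y_eval = x + drop_path(branch)    # identity: equal to x + branch
assert torch.equal(y_eval, x + branch)
print(drop_path)                  # StochasticDepth(p=0.1, mode=row)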