flash_attention_2 (#51)

- Add eager and sdpa attention implementations (835c717962e2632f116db776a087970c22e4a6c1)
- Add support for flash attention 2 (a7eaddd0ac0e89cf779dce9596635369178e15cf)
- Merge branch 'main' into attention (29038ea19de709ec833a7ad9e86e838e274194f2)

Files changed (2) hide show

config.json +1 -0
modeling_chatglm.py +203 -81

config.json CHANGED Viewed

@@ -17,6 +17,7 @@
   "apply_residual_connection_post_layernorm": false,
   "attention_dropout": 0.0,
   "attention_softmax_in_fp32": true,
   "bias_dropout_fusion": true,
   "ffn_hidden_size": 13696,
   "fp32_residual_connection": false,

   "apply_residual_connection_post_layernorm": false,
   "attention_dropout": 0.0,
   "attention_softmax_in_fp32": true,
+  "attn_implementation": "sdpa",
   "bias_dropout_fusion": true,
   "ffn_hidden_size": 13696,
   "fp32_residual_connection": false,

modeling_chatglm.py CHANGED Viewed

@@ -21,12 +21,17 @@ from transformers.modeling_outputs import (
     SequenceClassifierOutputWithPast,
 )
 from transformers.modeling_utils import PreTrainedModel
-from transformers.utils import logging, is_torch_npu_available
 from transformers.generation.logits_process import LogitsProcessor
 from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput
 from .configuration_chatglm import ChatGLMConfig
 # flags required to enable jit fusion kernels
 if sys.platform != 'darwin' and not is_torch_npu_available():
@@ -40,6 +45,7 @@ logger = logging.get_logger(__name__)
 _CHECKPOINT_FOR_DOC = "THUDM/ChatGLM"
 _CONFIG_FOR_DOC = "ChatGLMConfig"
 def default_init(cls, *args, **kwargs):
     return cls(*args, **kwargs)
@@ -159,12 +165,13 @@ class RMSNorm(torch.nn.Module):
 class CoreAttention(torch.nn.Module):
     def __init__(self, config: ChatGLMConfig, layer_number):
         super(CoreAttention, self).__init__()
         self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling
         self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
         if self.apply_query_key_layer_scaling:
             self.attention_softmax_in_fp32 = True
         self.layer_number = max(1, layer_number)
         projection_size = config.kv_channels * config.num_attention_heads
@@ -183,91 +190,198 @@ class CoreAttention(torch.nn.Module):
         self.attention_dropout = torch.nn.Dropout(config.attention_dropout)
     def forward(self, query_layer, key_layer, value_layer, attention_mask):
-        pytorch_major_version = int(torch.__version__.split('.')[0])
-        if pytorch_major_version >= 2:
-            if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
-                context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
-                                                                                 is_causal=True)
-            else:
-                if attention_mask is not None:
-                    attention_mask = ~attention_mask
-                context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
-                                                                                 attention_mask)
-            context_layer = context_layer.transpose(1, 2).contiguous()
-            new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
-            context_layer = context_layer.reshape(*new_context_layer_shape)
         else:
-            # Raw attention scores
-            # [b, np, sq, sk]
-            output_size = (query_layer.size(0), query_layer.size(1), query_layer.size(2), key_layer.size(2))
-            # [b, np, sq, hn] -> [b * np, sq, hn]
-            query_layer = query_layer.view(output_size[0] * output_size[1], output_size[2], -1)
-            # [b, np, sk, hn] -> [b * np, sk, hn]
-            key_layer = key_layer.view(output_size[0] * output_size[1], output_size[3], -1)
-            # preallocting input tensor: [b * np, sq, sk]
-            matmul_input_buffer = torch.empty(
-                output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype,
-                device=query_layer.device
             )
-            # Raw attention scores. [b * np, sq, sk]
-            matmul_result = torch.baddbmm(
-                matmul_input_buffer,
-                query_layer,  # [b * np, sq, hn]
-                key_layer.transpose(1, 2),  # [b * np, hn, sk]
-                beta=0.0,
-                alpha=(1.0 / self.norm_factor),
             )
-            # change view to [b, np, sq, sk]
-            attention_scores = matmul_result.view(*output_size)
-            # ===========================
-            # Attention probs and dropout
-            # ===========================
-            # attention scores and attention mask [b, np, sq, sk]
-            if self.attention_softmax_in_fp32:
-                attention_scores = attention_scores.float()
-            if self.coeff is not None:
-                attention_scores = attention_scores * self.coeff
-            if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]:
-                attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3],
-                                            device=attention_scores.device, dtype=torch.bool)
-                attention_mask.tril_()
-                attention_mask = ~attention_mask
-            if attention_mask is not None:
-                attention_scores = attention_scores.masked_fill(attention_mask, float("-inf"))
-            attention_probs = F.softmax(attention_scores, dim=-1)
-            attention_probs = attention_probs.type_as(value_layer)
-            # This is actually dropping out entire tokens to attend to, which might
-            # seem a bit unusual, but is taken from the original Transformer paper.
-            attention_probs = self.attention_dropout(attention_probs)
-            # query layer shape: [b * np, sq, hn]
-            # value layer shape: [b, np, sk, hn]
-            # attention shape: [b, np, sq, sk]
-            # context layer shape: [b, np, sq, hn]
-            output_size = (value_layer.size(0), value_layer.size(1), query_layer.size(1), value_layer.size(3))
-            # change view [b * np, sk, hn]
-            value_layer = value_layer.view(output_size[0] * output_size[1], value_layer.size(2), -1)
-            # change view [b * np, sq, sk]
-            attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
-            # matmul: [b * np, sq, hn]
-            context_layer = torch.bmm(attention_probs, value_layer)
-            # change view [b, np, sq, hn]
-            context_layer = context_layer.view(*output_size)
-            # [b, np, sq, hn] --> [b, sq, np, hn]
-            context_layer = context_layer.transpose(1, 2).contiguous()
-            # [b, sq, np, hn] --> [b, sq, hp]
-            new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
-            context_layer = context_layer.reshape(*new_context_layer_shape)
-        return context_layer
 class SelfAttention(torch.nn.Module):
@@ -299,7 +413,7 @@ class SelfAttention(torch.nn.Module):
                                          device=device, **_config_to_kwargs(config)
                                          )
-        self.core_attention = CoreAttention(config, self.layer_number)
         # Output.
         self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear,
@@ -378,7 +492,8 @@ class SelfAttention(torch.nn.Module):
             value_layer = torch.cat((cache_v, value_layer), dim=2)
         if use_cache:
             if kv_cache is None:
-                kv_cache = torch.cat((key_layer.unsqueeze(0).unsqueeze(0), value_layer.unsqueeze(0).unsqueeze(0)), dim=1)
             else:
                 kv_cache = (key_layer, value_layer)
         else:
@@ -644,12 +759,18 @@ class ChatGLMPreTrainedModel(PreTrainedModel):
     config_class = ChatGLMConfig
     base_model_prefix = "transformer"
     _no_split_modules = ["GLMBlock"]
     def _init_weights(self, module: nn.Module):
         """Initialize the weights."""
         return
     def get_masks(self, input_ids, past_key_values, padding_mask=None):
         batch_size, seq_length = input_ids.shape
         full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device)
         full_attention_mask.tril_()
@@ -724,7 +845,8 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
             config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
         )
-        self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, rope_ratio=config.rope_ratio, original_impl=config.original_rope,
                                               device=device, dtype=config.torch_dtype)
         self.encoder = init_method(GLMTransformer, config, **init_kwargs)
         self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False,

     SequenceClassifierOutputWithPast,
 )
 from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import logging, is_torch_npu_available, is_flash_attn_greater_or_equal_2_10, \
+    is_flash_attn_2_available
 from transformers.generation.logits_process import LogitsProcessor
 from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput
 from .configuration_chatglm import ChatGLMConfig
+if is_flash_attn_2_available():
+    from flash_attn import flash_attn_func, flash_attn_varlen_func
+    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
 # flags required to enable jit fusion kernels
 if sys.platform != 'darwin' and not is_torch_npu_available():
 _CHECKPOINT_FOR_DOC = "THUDM/ChatGLM"
 _CONFIG_FOR_DOC = "ChatGLMConfig"
 def default_init(cls, *args, **kwargs):
     return cls(*args, **kwargs)
 class CoreAttention(torch.nn.Module):
     def __init__(self, config: ChatGLMConfig, layer_number):
         super(CoreAttention, self).__init__()
+        self.config = config
         self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling
         self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
         if self.apply_query_key_layer_scaling:
             self.attention_softmax_in_fp32 = True
         self.layer_number = max(1, layer_number)
+        self.is_causal = True
         projection_size = config.kv_channels * config.num_attention_heads
         self.attention_dropout = torch.nn.Dropout(config.attention_dropout)
     def forward(self, query_layer, key_layer, value_layer, attention_mask):
+        # [b, np, sq, sk]
+        output_size = (query_layer.size(0), query_layer.size(1), query_layer.size(2), key_layer.size(2))
+        # [b, np, sq, hn] -> [b * np, sq, hn]
+        query_layer = query_layer.view(output_size[0] * output_size[1], output_size[2], -1)
+        # [b, np, sk, hn] -> [b * np, sk, hn]
+        key_layer = key_layer.view(output_size[0] * output_size[1], output_size[3], -1)
+        # preallocting input tensor: [b * np, sq, sk]
+        matmul_input_buffer = torch.empty(
+            output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype,
+            device=query_layer.device
+        )
+        # Raw attention scores. [b * np, sq, sk]
+        matmul_result = torch.baddbmm(
+            matmul_input_buffer,
+            query_layer,  # [b * np, sq, hn]
+            key_layer.transpose(1, 2),  # [b * np, hn, sk]
+            beta=0.0,
+            alpha=(1.0 / self.norm_factor),
+        )
+        # change view to [b, np, sq, sk]
+        attention_scores = matmul_result.view(*output_size)
+        # ===========================
+        # Attention probs and dropout
+        # ===========================
+        # attention scores and attention mask [b, np, sq, sk]
+        if self.attention_softmax_in_fp32:
+            attention_scores = attention_scores.float()
+        if self.coeff is not None:
+            attention_scores = attention_scores * self.coeff
+        if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]:
+            attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3],
+                                        device=attention_scores.device, dtype=torch.bool)
+            attention_mask.tril_()
+            attention_mask = ~attention_mask
+        if attention_mask is not None:
+            attention_scores = attention_scores.masked_fill(attention_mask, float("-inf"))
+        attention_probs = F.softmax(attention_scores, dim=-1)
+        attention_probs = attention_probs.type_as(value_layer)
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.attention_dropout(attention_probs)
+        # query layer shape: [b * np, sq, hn]
+        # value layer shape: [b, np, sk, hn]
+        # attention shape: [b, np, sq, sk]
+        # context layer shape: [b, np, sq, hn]
+        output_size = (value_layer.size(0), value_layer.size(1), query_layer.size(1), value_layer.size(3))
+        # change view [b * np, sk, hn]
+        value_layer = value_layer.view(output_size[0] * output_size[1], value_layer.size(2), -1)
+        # change view [b * np, sq, sk]
+        attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
+        # matmul: [b * np, sq, hn]
+        context_layer = torch.bmm(attention_probs, value_layer)
+        # change view [b, np, sq, hn]
+        context_layer = context_layer.view(*output_size)
+        # [b, np, sq, hn] --> [b, sq, np, hn]
+        context_layer = context_layer.transpose(1, 2).contiguous()
+        # [b, sq, np, hn] --> [b, sq, hp]
+        new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
+        context_layer = context_layer.reshape(*new_context_layer_shape)
+        return context_layer
+class SdpaAttention(CoreAttention):
+    def forward(self, query_layer, key_layer, value_layer, attention_mask):
+        if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
+            context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
+                                                                             is_causal=True,
+                                                                             dropout_p=self.config.attention_dropout if self.training else 0.0)
         else:
+            if attention_mask is not None:
+                attention_mask = ~attention_mask
+            context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
+                                                                             attention_mask,
+                                                                             dropout_p=self.config.attention_dropout if self.training else 0.0)
+        context_layer = context_layer.transpose(1, 2).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
+        context_layer = context_layer.reshape(*new_context_layer_shape)
+        return context_layer
+def _get_unpad_data(attention_mask):
+    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2
+class FlashAttention2(CoreAttention):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+    def forward(self, query_states, key_states, value_states, attention_mask):
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+        batch_size, query_length = query_states.shape[:2]
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+            causal = self.is_causal and query_length != 1
+        dropout = self.config.attention_dropout if self.training else 0.0
+        # Contains at least one padding token in the sequence
+        if attention_mask is not None:
+            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
+                query_states, key_states, value_states, attention_mask, query_length
             )
+            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+            attn_output_unpad = flash_attn_varlen_func(
+                query_states,
+                key_states,
+                value_states,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_k,
+                max_seqlen_q=max_seqlen_in_batch_q,
+                max_seqlen_k=max_seqlen_in_batch_k,
+                dropout_p=dropout,
+                softmax_scale=None,
+                causal=causal,
             )
+            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+        else:
+            attn_output = flash_attn_func(
+                query_states, key_states, value_states, dropout, softmax_scale=None, causal=causal
+            )
+        attn_output = attn_output.reshape(batch_size, query_length, self.hidden_size_per_partition).contiguous()
+        return attn_output
+    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
+        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
+        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
+        key_layer = index_first_axis(
+            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+        )
+        value_layer = index_first_axis(
+            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+        )
+        if query_length == kv_seq_len:
+            query_layer = index_first_axis(
+                query_layer.reshape(batch_size * kv_seq_len, self.num_attention_heads_per_partition, head_dim), indices_k
+            )
+            cu_seqlens_q = cu_seqlens_k
+            max_seqlen_in_batch_q = max_seqlen_in_batch_k
+            indices_q = indices_k
+        elif query_length == 1:
+            max_seqlen_in_batch_q = 1
+            cu_seqlens_q = torch.arange(
+                batch_size + 1, dtype=torch.int32, device=query_layer.device
+            )  # There is a memcpy here, that is very bad.
+            indices_q = cu_seqlens_q[:-1]
+            query_layer = query_layer.squeeze(1)
+        else:
+            # The -q_len: slice assumes left padding.
+            attention_mask = attention_mask[:, -query_length:]
+            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
+        return (
+            query_layer,
+            key_layer,
+            value_layer,
+            indices_q,
+            (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+        )
+CORE_ATTENTION_CLASSES = {
+    "eager": CoreAttention,
+    "sdpa": SdpaAttention,
+    "flash_attention_2": FlashAttention2
+}
 class SelfAttention(torch.nn.Module):
                                          device=device, **_config_to_kwargs(config)
                                          )
+        self.core_attention = CORE_ATTENTION_CLASSES[config._attn_implementation](config, self.layer_number)
         # Output.
         self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear,
             value_layer = torch.cat((cache_v, value_layer), dim=2)
         if use_cache:
             if kv_cache is None:
+                kv_cache = torch.cat((key_layer.unsqueeze(0).unsqueeze(0), value_layer.unsqueeze(0).unsqueeze(0)),
+                                     dim=1)
             else:
                 kv_cache = (key_layer, value_layer)
         else:
     config_class = ChatGLMConfig
     base_model_prefix = "transformer"
     _no_split_modules = ["GLMBlock"]
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
     def _init_weights(self, module: nn.Module):
         """Initialize the weights."""
         return
     def get_masks(self, input_ids, past_key_values, padding_mask=None):
+        if self.config._attn_implementation == "flash_attention_2":
+            if padding_mask is not None and not padding_mask.all():
+                return padding_mask
+            return None
         batch_size, seq_length = input_ids.shape
         full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device)
         full_attention_mask.tril_()
             config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
         )
+        self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, rope_ratio=config.rope_ratio,
+                                              original_impl=config.original_rope,
                                               device=device, dtype=config.torch_dtype)
         self.encoder = init_method(GLMTransformer, config, **init_kwargs)
         self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False,