feat: reverted monkey patch
- configuration_bert.py  +0 -2
- modeling_bert.py  +5 -17
configuration_bert.py
CHANGED
@@ -14,8 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ BERT model configuration"""
-from collections import OrderedDict
-from typing import Mapping
 
 from transformers import PretrainedConfig
 
modeling_bert.py
CHANGED
@@ -28,16 +28,13 @@ from transformers.models.bert.modeling_bert import (
     BaseModelOutputWithPoolingAndCrossAttentions,
     BertForPreTrainingOutput,
 )
-from .patched_padding_bert import index_first_axis as index_first_axis_monkey_patch
-import flash_attn.bert_padding
-flash_attn.bert_padding.index_first_axis = index_first_axis_monkey_patch
-"""
 from flash_attn.bert_padding import (
+    index_first_axis,
     index_first_axis_residual,
     pad_input,
     unpad_input,
 )
-"""
+
 from flash_attn.modules.block import Block
 from flash_attn.modules.embedding import BertEmbeddings
 from flash_attn.modules.mha import MHA
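The removed lines above are the monkey patch itself: the file imported its own index_first_axis from patched_padding_bert and assigned it onto the flash_attn.bert_padding module so later lookups would resolve to the patched version, while the stock import block appears to have been fenced off. The revert restores the plain flash_attn import, with index_first_axis added back to the import list. A minimal sketch of that patching pattern follows; my_patched_index_first_axis is a hypothetical stand-in, not the repo's patched implementation.

import torch
import flash_attn.bert_padding


def my_patched_index_first_axis(tensor: torch.Tensor, indices: torch.Tensor) -> torch.Tensor:
    # Plain gather along the first axis, standing in for the patched kernel.
    return tensor[indices]


# The monkey patch: swap the attribute on the already-imported module.
flash_attn.bert_padding.index_first_axis = my_patched_index_first_axis
# Callers that resolve flash_attn.bert_padding.index_first_axis at call time now hit
# the patch; callers that had already bound the name via
# `from flash_attn.bert_padding import index_first_axis` keep the original.
# The revert drops the assignment and imports the upstream symbol directly again.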
@@ -176,14 +173,14 @@ class BertEncoder(nn.Module):
                 hidden_states = hidden_states[subset_mask]
         else:
             batch, seqlen = hidden_states.shape[:2]
-            hidden_states, indices, cu_seqlens, max_seqlen_in_batch =
+            hidden_states, indices, cu_seqlens, max_seqlen_in_batch = unpad_input(
                 hidden_states, key_padding_mask
             )
             mixer_kwargs = {"cu_seqlens": cu_seqlens, "max_seqlen": max_seqlen_in_batch}
             if subset_mask is None:
                 for layer in self.layers:
                     hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
-                hidden_states =
+                hidden_states = pad_input(hidden_states, indices, batch, seqlen)
             else:
                 for layer in self.layers[:-1]:
                     hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
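The restored calls in this hunk are flash_attn's padding round trip: unpad_input packs the valid tokens of a (batch, seqlen, hidden) tensor into a (total_tokens, hidden) tensor plus cu_seqlens offsets, the encoder layers run on the packed tensor, and pad_input scatters the result back into the padded layout. A rough plain-PyTorch sketch of the shapes involved; sketch_unpad and sketch_pad are hypothetical helpers, not the flash_attn implementations, which go through a custom autograd Function.

import torch
import torch.nn.functional as F


def sketch_unpad(hidden_states: torch.Tensor, key_padding_mask: torch.Tensor):
    # hidden_states: (batch, seqlen, hidden); key_padding_mask: (batch, seqlen) bool
    seqlens = key_padding_mask.sum(dim=-1, dtype=torch.int32)            # tokens per sequence
    indices = torch.nonzero(key_padding_mask.flatten(), as_tuple=False).flatten()
    cu_seqlens = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))
    packed = hidden_states.reshape(-1, hidden_states.shape[-1])[indices]  # (total_tokens, hidden)
    return packed, indices, cu_seqlens, int(seqlens.max())


def sketch_pad(packed: torch.Tensor, indices: torch.Tensor, batch: int, seqlen: int):
    # Scatter the packed tokens back into a zero-padded (batch, seqlen, hidden) tensor.
    out = packed.new_zeros(batch * seqlen, packed.shape[-1])
    out[indices] = packed
    return out.reshape(batch, seqlen, -1)


x = torch.randn(2, 4, 8)
mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]], dtype=torch.bool)
packed, idx, cu, max_len = sketch_unpad(x, mask)   # packed: (5, 8), cu: [0, 3, 5]
restored = sketch_pad(packed, idx, 2, 4)           # padded positions come back as zeros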
@@ -201,7 +198,7 @@
                     subset_cu_seqlens = F.pad(
                         torch.cumsum(subset_seqlens, dim=0, dtype=torch.torch.int32), (1, 0)
                     )
-                hidden_states_subset, hidden_states =
+                hidden_states_subset, hidden_states = index_first_axis_residual(
                     hidden_states, subset_idx
                 )
                 # It's ok to set max_seqlen_q to be much larger
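Two small operations matter in the subset branch: the F.pad(torch.cumsum(...)) idiom in the context lines builds the cumulative-length offsets for the selected tokens, and the restored index_first_axis_residual call gathers just those rows while also handing back the full packed tensor as a second return value (visible in the unpacking on the left-hand side). A small worked example with made-up numbers:

import torch
import torch.nn.functional as F

subset_seqlens = torch.tensor([2, 1, 3], dtype=torch.int32)   # selected tokens per sequence
subset_cu_seqlens = F.pad(torch.cumsum(subset_seqlens, dim=0, dtype=torch.int32), (1, 0))
# -> tensor([0, 2, 3, 6], dtype=torch.int32): offset of each sequence in the packed subset

hidden_states = torch.randn(8, 16)                             # packed (total_tokens, hidden)
subset_idx = torch.tensor([0, 3, 4, 5, 6, 7])                  # positions kept, e.g. masked tokens
hidden_states_subset = hidden_states[subset_idx]               # what index_first_axis_residual gathers;
# the real call also returns the full hidden_states unchanged as its second value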
@@ -425,15 +422,6 @@ class BertModel(BertPreTrainedModel):
             pooler_output=pooled_output,
         )
 
-    def to(self, *args, **kwargs):
-        print(f'In BERT, calling to({args, kwargs})')
-        result = super().to(*args, **kwargs)
-        if (len(args) > 0 and isinstance(args[0], torch.dtype)) or "dtype" in kwargs:
-            for layer in result.encoder.layers:
-                layer.mixer.inner_cross_attn.alibi_slopes = layer.mixer.inner_cross_attn.alibi_slopes.to(torch.float32)
-                layer.mixer.inner_attn.alibi_slopes = layer.mixer.inner_attn.alibi_slopes.to(torch.float32)
-        return result
-
 
 class BertForPreTraining(BertPreTrainedModel):
     def __init__(self, config: JinaBertConfig):
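The deleted to() override (together with its debug print) intercepted dtype casts and forced the ALiBi slope buffers of every layer back to float32, presumably because the flash-attention kernels expect the slopes in fp32 even when the rest of the model runs in fp16 or bf16. A self-contained sketch of that "keep one buffer in fp32 across casts" pattern on a toy module; TinyModel and AlibiAttention are made up for illustration and are not the repository's classes.

import torch
import torch.nn as nn


class AlibiAttention(nn.Module):
    def __init__(self, num_heads: int = 8):
        super().__init__()
        # One slope per head; registered as a buffer, so .to(dtype) would normally cast it.
        self.register_buffer("alibi_slopes", torch.rand(num_heads, dtype=torch.float32))


class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.attn = AlibiAttention()

    def to(self, *args, **kwargs):
        result = super().to(*args, **kwargs)
        # If the call included a dtype (e.g. model.to(torch.bfloat16)),
        # undo the cast on the slope buffer only.
        if (len(args) > 0 and isinstance(args[0], torch.dtype)) or "dtype" in kwargs:
            result.attn.alibi_slopes = result.attn.alibi_slopes.to(torch.float32)
        return result


m = TinyModel().to(torch.bfloat16)
assert m.attn.alibi_slopes.dtype == torch.float32   # slopes stay in fp32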