feat: added separate BertForMaskedLM class
modeling_bert.py  CHANGED  +80 -0
@@ -689,4 +689,84 @@ class BertForPreTraining(BertPreTrainedModel):
             loss=total_loss,
             prediction_logits=prediction_scores,
             seq_relationship_logits=seq_relationship_score,
         )
+
+
+class BertForMaskedLM(BertPreTrainedModel):
+    def __init__(self, config: JinaBertConfig):
+        super().__init__(config)
+        # If dense_seq_output, we only need to pass the hidden states for the masked out tokens
+        # (around 15%) to the classifier heads.
+        self.dense_seq_output = getattr(config, "dense_seq_output", False)
+        # If last_layer_subset, we only need to compute the last layer for a subset of tokens
+        # (e.g., the tokens we need to compute the masked LM loss and the next-sentence prediction).
+        self.last_layer_subset = getattr(config, "last_layer_subset", False)
+        if self.last_layer_subset:
+            assert self.dense_seq_output, "last_layer_subset requires dense_seq_output"
+        use_xentropy = getattr(config, "use_xentropy", False)
+        if use_xentropy and CrossEntropyLoss is None:
+            raise ImportError("xentropy_cuda is not installed")
+        loss_cls = (
+            nn.CrossEntropyLoss
+            if not use_xentropy
+            else partial(CrossEntropyLoss, inplace_backward=True)
+        )
+
+        self.bert = BertModel(config)
+        self.cls = BertPreTrainingHeads(config)
+        self.mlm_loss = loss_cls(ignore_index=0)
+
+        # Initialize weights and apply final processing
+        self.apply(partial(_init_weights, initializer_range=config.initializer_range))
+        self.tie_weights()
+
+    def tie_weights(self):
+        self.cls.predictions.decoder.weight = self.bert.embeddings.word_embeddings.weight
+
+    def get_input_embeddings(self):
+        return self.bert.embeddings.word_embeddings
+
+    def forward(
+        self,
+        input_ids,
+        position_ids=None,
+        token_type_ids=None,
+        attention_mask=None,
+        labels=None,
+    ):
+        masked_tokens_mask = labels > 0 if (self.last_layer_subset and labels is not None) else None
+        outputs = self.bert(
+            input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            attention_mask=attention_mask.bool() if attention_mask is not None else None,
+            masked_tokens_mask=masked_tokens_mask,
+        )
+        sequence_output, pooled_output = outputs.last_hidden_state, outputs.pooler_output
+        if self.dense_seq_output and labels is not None:
+            masked_token_idx = torch.nonzero(labels.flatten() > 0, as_tuple=False).flatten()
+            if not self.last_layer_subset:
+                sequence_output = index_first_axis(
+                    rearrange(sequence_output, "b s d -> (b s) d"), masked_token_idx
+                )
+        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
+
+        if (
+            self.dense_seq_output and labels is not None
+        ):  # prediction_scores are already flattened
+            masked_lm_loss = self.mlm_loss(
+                prediction_scores, labels.flatten()[masked_token_idx]
+            ).float()
+        else:
+            assert labels is not None
+            masked_lm_loss = self.mlm_loss(
+                rearrange(prediction_scores, "... v -> (...) v"),
+                rearrange(labels, "... -> (...)"),
+            ).float()
+
+        return BertForPreTrainingOutput(
+            loss=masked_lm_loss,
+            prediction_logits=prediction_scores,
+            seq_relationship_logits=seq_relationship_score,
+        )
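
Usage note (not part of this commit): a minimal sketch of how the new class might be driven. The JinaBertConfig field names, module paths, and the [MASK] token id below are assumptions for illustration; the only conventions taken from the diff are the forward() signature, the dense_seq_output flag, and that label 0 marks non-masked positions (ignore_index=0).

    # Illustrative sketch only; config fields and import paths are assumed, not defined by this diff.
    import torch
    from configuration_bert import JinaBertConfig   # assumed module path
    from modeling_bert import BertForMaskedLM

    config = JinaBertConfig(
        vocab_size=30528,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        dense_seq_output=True,  # route only the ~15% masked positions through the MLM head
    )
    model = BertForMaskedLM(config)

    input_ids = torch.randint(1, config.vocab_size, (2, 128))
    attention_mask = torch.ones_like(input_ids)
    labels = torch.zeros_like(input_ids)       # 0 = not masked, ignored by the loss (ignore_index=0)
    labels[:, 10] = input_ids[:, 10]           # pretend position 10 of each sequence was masked
    input_ids[:, 10] = 103                     # hypothetical [MASK] token id

    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    print(outputs.loss, outputs.prediction_logits.shape)  # with dense_seq_output, logits cover only masked positions

With dense_seq_output enabled, the gather via index_first_axis is equivalent to hidden_states.view(-1, hidden_size)[masked_token_idx], so the MLM head and loss only run over the masked positions rather than the full sequence.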