Jackmin108 committed
Commit c1736a8 · Parent(s): 362ef00

feat: adapter masking wip

Signed-off-by: Meow <ongjackm@gmail.com>

Files changed:
- embedding.py  +21 -4
- modeling_lora.py  +10 -2
- modeling_xlm_roberta.py  +7 -6
- xlm_padding.py  +9 -1
embedding.py
CHANGED

@@ -40,7 +40,7 @@ class XLMRobertaEmbeddings(nn.Module):
         if self.type_vocab_size > 0:
             self.token_type_embeddings = nn.Embedding(type_vocab_size, embed_dim, **factory_kwargs)
 
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, task_type=None):
+    def forward(self, input_ids, position_ids=None, token_type_ids=None, task_type=None, adapter_mask=None):
         """
         input_ids: (batch, seqlen)
         position_ids: (batch, seqlen)
@@ -55,9 +55,25 @@ class XLMRobertaEmbeddings(nn.Module):
             emb1 = self.word_embeddings(tensor1, task_type=task_type[0])
             emb2 = self.word_embeddings(tensor2, task_type=task_type[1])
             embeddings = torch.cat((emb1, emb2), dim=0)
+
+            unique_tasks = torch.unique(adapter_mask).tolist()
+            torch_dtype = next(self.word_embeddings.parameters()).dtype
+            embeddings = torch.empty(*input_ids.shape, self.word_embeddings.embedding_dim, dtype=torch_dtype).to(input_ids.device)
+            for task in unique_tasks:
+                indices = (adapter_mask == task).nonzero(as_tuple=True)[0]
+                inp = input_ids[indices]
+                lora_kwargs = {'task_type': task} if task is not None else {}
+                emb = self.word_embeddings(inp, **lora_kwargs)
+                embeddings[indices] = emb
+
+            exit(0)
         else:
-            lora_kwargs = {'task_type': task_type} if task_type is not None else {}
-            embeddings = self.word_embeddings(input_ids, **lora_kwargs)
+            unique_task = torch.unique(adapter_mask)[0]
+            task1_indices = (adapter_mask == unique_task).nonzero(as_tuple=True)[0]
+            input1 = input_ids[task1_indices]
+            lora_kwargs = {'task_type': unique_task} if unique_task is not None else {}
+            embeddings = self.word_embeddings(input1, **lora_kwargs)
+
 
         if self.max_position_embeddings > 0:
             if position_ids is None:
@@ -79,7 +95,8 @@ class XLMRobertaEmbeddings(nn.Module):
                 emb2 = emb2 + token_type_embs2
                 embeddings = torch.cat((emb1, emb2), dim=0)
             else:
-                lora_kwargs = {'task_type': task_type} if task_type is not None else {}
+                unique_task = torch.unique(adapter_mask)[0]
+                lora_kwargs = {'task_type': unique_task} if unique_task is not None else {}
                 token_type_embeddings = self.token_type_embeddings(token_type_ids, **lora_kwargs)
                 embeddings = embeddings + token_type_embeddings
         return embeddings
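The embedding change routes each batch row through the word embeddings with the adapter id given by adapter_mask, then scatters the results back into one output tensor (the tuple branch still ends in a debug exit(0), in line with the "wip" commit message). The routing pattern can be exercised on its own; below is a minimal sketch in plain PyTorch, where the hypothetical route_by_adapter helper and two separate nn.Embedding tables stand in for the LoRA-adapted word_embeddings.

import torch
import torch.nn as nn

def route_by_adapter(input_ids, adapter_mask, embed_fns):
    # Hypothetical helper mirroring the new embedding path: embed_fns maps an
    # adapter id to a callable that embeds a (rows, seqlen) slice of input_ids.
    out = None
    for task in torch.unique(adapter_mask).tolist():
        indices = (adapter_mask == task).nonzero(as_tuple=True)[0]
        emb = embed_fns[task](input_ids[indices])
        if out is None:
            out = torch.empty(*input_ids.shape, emb.shape[-1], dtype=emb.dtype)
        out[indices] = emb
    return out

# Toy usage: two separate embedding tables stand in for two LoRA adapters.
vocab_size, embed_dim = 10, 4
tables = {0: nn.Embedding(vocab_size, embed_dim), 1: nn.Embedding(vocab_size, embed_dim)}
input_ids = torch.randint(0, vocab_size, (3, 5))
adapter_mask = torch.tensor([0, 1, 0])  # one adapter id per batch row
print(route_by_adapter(input_ids, adapter_mask, tables).shape)  # torch.Size([3, 5, 4])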
modeling_lora.py
CHANGED

@@ -177,7 +177,11 @@ class LoRAParametrization(nn.Module):
            )
 
            def new_forward(self, input, task_type, residual=False):
-                task_idx = adaptation_map[task_type] if task_type else None
+                if isinstance(task_type, str):
+                    task_idx = adaptation_map[task_type] if task_type else None
+                else:
+                    task_idx = task_type
+
                if task_idx is not None:
                    weights = self.parametrizations.weight[0].lora_forward(self.weight, current_task=task_idx)
                else:
@@ -205,7 +209,11 @@ class LoRAParametrization(nn.Module):
            )
 
            def new_forward(self, input, task_type):
-                task_idx = adaptation_map[task_type] if task_type else None
+                if isinstance(task_type, str):
+                    task_idx = adaptation_map[task_type] if task_type else None
+                else:
+                    task_idx = task_type
+
                if task_idx is not None:
                    weights = self.parametrizations.weight[0].lora_forward(self.weight, current_task=task_idx)
                else:
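Both new_forward variants now accept task_type either as a task name or as an already-resolved adapter index. A minimal sketch of that resolution step, assuming an adaptation_map from task names to adapter indices (the mapping below is illustrative, not the repository's actual one):

adaptation_map = {"retrieval": 0, "classification": 1}  # illustrative mapping, not the repo's

def resolve_task_idx(task_type, adaptation_map):
    # Accept a task name (str), an adapter index (int), or None.
    if isinstance(task_type, str):
        return adaptation_map[task_type] if task_type else None
    return task_type

print(resolve_task_idx("retrieval", adaptation_map))  # 0
print(resolve_task_idx(1, adaptation_map))            # 1
print(resolve_task_idx(None, adaptation_map))         # None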
modeling_xlm_roberta.py
CHANGED

@@ -204,7 +204,7 @@ class XLMRobertaEncoder(nn.Module):
     def gradient_checkpointing(self, value):
         self._grad_checkpointing = value
 
-    def forward(self, hidden_states, key_padding_mask=None, subset_mask=None, task_type=None):
+    def forward(self, hidden_states, key_padding_mask=None, subset_mask=None, task_type=None, adapter_mask=None):
         """If subset_mask is not None, we only want output for the subset of the sequence.
         This means that we only compute the last layer output for these tokens.
         subset_mask: (batch, seqlen), dtype=torch.bool
@@ -230,10 +230,10 @@ class XLMRobertaEncoder(nn.Module):
                 hidden_states = hidden_states[subset_mask]
         else:
             batch, seqlen = hidden_states.shape[:2]
-            hidden_states, indices, cu_seqlens, max_seqlen_in_batch = unpad_input(
-                hidden_states, key_padding_mask
+            hidden_states, indices, cu_seqlens, max_seqlen_in_batch, cu_adapter_mask = unpad_input(
+                hidden_states, key_padding_mask, adapter_mask
             )
-            mixer_kwargs = {"cu_seqlens": cu_seqlens, "max_seqlen": max_seqlen_in_batch, "task_type": task_type}
+            mixer_kwargs = {"cu_seqlens": cu_seqlens, "max_seqlen": max_seqlen_in_batch, "task_type": task_type, "cu_adapter_mask": cu_adapter_mask}
             if subset_mask is None:
                 for layer in self.layers:
                     if self._grad_checkpointing:
@@ -649,6 +649,7 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
         masked_tokens_mask: (batch, seqlen), dtype=torch.bool
         """
         task_type = kwargs.pop('task_type', None)
+        adapter_mask = kwargs.pop('adapter_mask', None)
         if kwargs:
             for key, value in kwargs.items():
                 if value is not None:
@@ -662,7 +663,7 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
         )
 
         hidden_states = self.embeddings(
-            input_ids, position_ids=position_ids, token_type_ids=token_type_ids, task_type=task_type
+            input_ids, position_ids=position_ids, token_type_ids=token_type_ids, task_type=task_type, adapter_mask=adapter_mask
         )
         # TD [2022-12:18]: Don't need to force residual in fp32
         # BERT puts embedding LayerNorm before embedding dropout.
@@ -686,7 +687,7 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
             subset_mask = None
 
         sequence_output = self.encoder(
-            hidden_states, key_padding_mask=attention_mask, subset_mask=subset_mask, task_type=task_type
+            hidden_states, key_padding_mask=attention_mask, subset_mask=subset_mask, task_type=task_type, adapter_mask=adapter_mask
         )
 
         if masked_tokens_mask is None:
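XLMRobertaModel.forward pops adapter_mask from **kwargs before the leftover-kwargs check, then threads it into both the embedding layer and the encoder, where unpad_input converts it into cu_adapter_mask for the varlen attention path. A minimal sketch of that kwargs handling, assuming the leftover-kwargs check rejects unknown non-None arguments (the TypeError below is a stand-in for whatever the model actually does with them):

import torch

def forward_stub(input_ids, **kwargs):
    # Pop the optional LoRA-related kwargs first, as in XLMRobertaModel.forward.
    task_type = kwargs.pop('task_type', None)
    adapter_mask = kwargs.pop('adapter_mask', None)
    # Any remaining non-None kwarg is treated as unexpected (assumed behavior).
    for key, value in kwargs.items():
        if value is not None:
            raise TypeError(f"unexpected keyword argument: {key}")
    return task_type, adapter_mask

print(forward_stub(torch.tensor([[1, 2, 3]]),
                   task_type='retrieval',
                   adapter_mask=torch.tensor([0], dtype=torch.int32)))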
xlm_padding.py
CHANGED

@@ -98,7 +98,7 @@ class IndexFirstAxisResidual(torch.autograd.Function):
 index_first_axis_residual = IndexFirstAxisResidual.apply
 
 
-def unpad_input(hidden_states, attention_mask):
+def unpad_input(hidden_states, attention_mask, adapter_mask):
     """
     Arguments:
         hidden_states: (batch, seqlen, ...)
@@ -113,6 +113,13 @@ def unpad_input(hidden_states, attention_mask):
     indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
     max_seqlen_in_batch = seqlens_in_batch.max().item()
     cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
+
+    cu_adapter_mask = torch.empty(cu_seqlens[-1], dtype=torch.int32)
+    for i in range(len(adapter_mask)):
+        start_idx = cu_seqlens[i]
+        end_idx = cu_seqlens[i + 1]
+        cu_adapter_mask[start_idx:end_idx] = adapter_mask[i]
+
     # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
     # bool mask, then call nonzero to get the indices, then index with those. The indices is @dim
     # times larger than it needs to be, wasting memory. It's faster and more memory-efficient to
@@ -123,6 +130,7 @@ def unpad_input(hidden_states, attention_mask):
         indices,
         cu_seqlens,
         max_seqlen_in_batch,
+        cu_adapter_mask,
     )
 
 
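unpad_input now also returns cu_adapter_mask: the per-sequence adapter ids are expanded to one id per unpadded token, aligned with the cu_seqlens offsets used by the varlen (flash-attention style) path. A minimal sketch of that expansion on a toy two-sequence batch, using the same cumsum-and-pad construction as the function above:

import torch
import torch.nn.functional as F

attention_mask = torch.tensor([[1, 1, 1, 0],   # sequence 0 has 3 real tokens
                               [1, 1, 0, 0]])  # sequence 1 has 2 real tokens
adapter_mask = torch.tensor([0, 1], dtype=torch.int32)  # one adapter id per sequence

seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))

# Expand per-sequence adapter ids to per-token ids over the unpadded token stream.
cu_adapter_mask = torch.empty(int(cu_seqlens[-1]), dtype=torch.int32)
for i in range(len(adapter_mask)):
    cu_adapter_mask[cu_seqlens[i]:cu_seqlens[i + 1]] = adapter_mask[i]

print(cu_seqlens.tolist())       # [0, 3, 5]
print(cu_adapter_mask.tolist())  # [0, 0, 0, 1, 1]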