Match args/kwargs for wrapped function
#7
opened by kornfield
modeling_mpt.py CHANGED (+7 -6)
@@ -242,19 +242,20 @@ class MPTModel(MPTPreTrainedModel):
             if self.gradient_checkpointing and self.training:
 
                 def create_custom_forward(module):
-                    def custom_forward(*inputs):
+                    def custom_forward(*inputs, **kwargs):
                         # None for past_key_value
-                        return module(*inputs)
+                        return module(*inputs, **kwargs)
 
                     return custom_forward
 
                 (x, attn_weights, present) = torch.utils.checkpoint.checkpoint(
                     create_custom_forward(block),
                     x,
-                    past_key_value,
-                    attn_bias,
-                    attention_mask,
-                    self.is_causal,
+                    past_key_value=past_key_value,
+                    attn_bias=attn_bias,
+                    attention_mask=attention_mask,
+                    is_causal=self.is_causal,
+                    output_attentions=bool(output_attentions)
                 )
             else:
                 (x, attn_weights, present) = block(x, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=self.is_causal, output_attentions=bool(output_attentions))
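The change makes custom_forward accept and forward keyword arguments, so the block's keyword arguments (past_key_value, attn_bias, attention_mask, is_causal, output_attentions) can be passed to torch.utils.checkpoint.checkpoint by name instead of positionally, matching the non-checkpointed call in the else branch. A minimal standalone sketch of the pattern follows; Block here is a hypothetical stand-in for an MPT block, and forwarding keyword arguments through checkpoint this way assumes a PyTorch version with non-reentrant checkpointing (use_reentrant=False), since the older reentrant implementation only forwards positional arguments to the wrapped function.

import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint

class Block(nn.Module):
    # Hypothetical stand-in for an MPT transformer block:
    # takes a tensor plus keyword arguments.
    def __init__(self, d: int = 8):
        super().__init__()
        self.proj = nn.Linear(d, d)

    def forward(self, x, attention_mask=None, is_causal=True):
        h = self.proj(x)
        if attention_mask is not None:
            h = h * attention_mask
        return h

def create_custom_forward(module):
    # Same wrapper shape as the patched code: forward both
    # positional and keyword arguments to the wrapped module.
    def custom_forward(*inputs, **kwargs):
        return module(*inputs, **kwargs)
    return custom_forward

block = Block()
x = torch.randn(2, 4, 8, requires_grad=True)
mask = torch.ones(2, 4, 1)

# Non-reentrant checkpointing passes the extra keyword arguments through
# to custom_forward; the reentrant variant would reject them.
out = checkpoint(
    create_custom_forward(block),
    x,
    attention_mask=mask,
    is_causal=True,
    use_reentrant=False,
)
out.sum().backward()
print(x.grad.shape)  # gradients flow back to x through the checkpointed block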