Upload GPTRefactForCausalLM
- config.json +1 -1
- modeling_gpt_refact.py +9 -13
- pytorch_model.bin +2 -2
config.json CHANGED
@@ -20,7 +20,7 @@
   "n_layer": 32,
   "n_positions": 4096,
   "scale_attention_softmax_in_fp32": true,
-  "torch_dtype": "
+  "torch_dtype": "bfloat16",
   "transformers_version": "4.31.0",
   "use_cache": true,
   "vocab_size": 49216
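The config now records the checkpoint dtype as bfloat16. A minimal loading sketch, assuming a placeholder repository id (substitute the actual repo) and that the custom modeling_gpt_refact.py is pulled in via trust_remote_code; this is an illustration, not part of the commit:

```python
import torch
from transformers import AutoModelForCausalLM

repo_id = "<org>/<gpt-refact-model>"  # placeholder, not the real repo id

model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16,  # matches the "torch_dtype" now stored in config.json
    trust_remote_code=True,      # needed for the custom GPTRefactForCausalLM class
)
print(next(model.parameters()).dtype)  # expect torch.bfloat16
```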
modeling_gpt_refact.py CHANGED
@@ -101,7 +101,6 @@ def get_alibi_biases(
     # Multiply them pair-wise to get the AliBi bias matrix
     biases = distance[:, :, None] * m[None, None, :]
     biases = biases.permute(2, 0, 1)[None, :, :T, :T]
-    biases = biases.repeat(B, 1, 1, 1)
     return biases.contiguous()
 
 
@@ -132,8 +131,7 @@ class Attention(nn.Module):
         self.attention_bias_in_fp32 = config.attention_bias_in_fp32
 
         self.q = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
-        self.k = nn.Linear(self.embed_dim, self.head_dim, bias=False)
-        self.v = nn.Linear(self.embed_dim, self.head_dim, bias=False)
+        self.kv = nn.Linear(self.embed_dim, self.head_dim * 2, bias=False)
         self.c_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
 
     def _get_mask_value(self, device, dtype):
@@ -200,8 +198,8 @@ class Attention(nn.Module):
         Tuple[torch.Tensor, Optional[torch.Tensor], Tuple[torch.Tensor, ...]],
     ]:
         query = self.q(hidden_states)
-        key = self.k(hidden_states)
-        value = self.v(hidden_states)
+        kv = self.kv(hidden_states)
+        key, value = kv.split(self.head_dim, dim=-1)
 
         if layer_past is not None:
             past_key, past_value = layer_past
@@ -231,15 +229,14 @@ class MLP(nn.Module):
         embed_dim = config.hidden_size
         hidden_dim = intermediate_size
         hidden_dim = int(2 * hidden_dim / 3)
-        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
-        self.
-        self.
-        self.c_proj = nn.Linear(hidden_dim, embed_dim, bias=False)
+        self.hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+        self.gate_up_proj = nn.Linear(embed_dim, self.hidden_dim * 2, bias=False)
+        self.c_proj = nn.Linear(self.hidden_dim, embed_dim, bias=False)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-
-        x2 = self.
-        x = self.c_proj(x1 * x2)
+        up_proj = self.gate_up_proj(x)
+        x1, x2 = torch.split(up_proj, self.hidden_dim, dim=-1)
+        x = self.c_proj(F.silu(x1) * x2)
         return x
 
 
@@ -264,7 +261,6 @@ class GPTRefactBlock(nn.Module):
         self.ln_1 = LayerNormNoBias(hidden_size, eps=config.layer_norm_epsilon)
         self.attn = Attention(config, layer_idx=layer_idx)
         self.ln_2 = LayerNormNoBias(hidden_size, eps=config.layer_norm_epsilon)
-
         self.mlp = MLP(self.inner_dim, config)
 
     def forward(
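The modeling changes fold the separate key/value projections into a single kv linear and the MLP's two input projections into a single gate_up_proj, splitting the fused output afterwards; the ALiBi bias is also no longer repeated along the batch dimension. A standalone sketch of the fused-projection pattern; the sizes below are illustrative assumptions, not values taken from this checkpoint, and the code is not the repository's Attention/MLP classes:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

embed_dim, head_dim, hidden_dim = 2048, 64, 5632  # illustrative sizes only
x = torch.randn(1, 10, embed_dim)

# Fused key/value projection: one matmul instead of two, split afterwards.
kv = nn.Linear(embed_dim, head_dim * 2, bias=False)
key, value = kv(x).split(head_dim, dim=-1)          # each: (1, 10, head_dim)

# Fused SwiGLU-style MLP: gate and up projections share one weight matrix.
gate_up_proj = nn.Linear(embed_dim, hidden_dim * 2, bias=False)
c_proj = nn.Linear(hidden_dim, embed_dim, bias=False)
x1, x2 = torch.split(gate_up_proj(x), hidden_dim, dim=-1)
out = c_proj(F.silu(x1) * x2)                        # (1, 10, embed_dim)
print(key.shape, value.shape, out.shape)
```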
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:6bf4dc20907069119671fdaf9f7b79d0260cd36ab94626f4af4fdd5a157d0205
+size 3171755929
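A small sketch for checking a downloaded pytorch_model.bin against the new LFS pointer; the chunked hashing below is an assumption about how one might verify the file locally, not part of the repository:

```python
import hashlib
from pathlib import Path

path = Path("pytorch_model.bin")  # assumes the file sits next to this script
expected_oid = "6bf4dc20907069119671fdaf9f7b79d0260cd36ab94626f4af4fdd5a157d0205"
expected_size = 3171755929

h = hashlib.sha256()
with path.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        h.update(chunk)

assert path.stat().st_size == expected_size, "size mismatch"
assert h.hexdigest() == expected_oid, "sha256 mismatch"
print("pytorch_model.bin matches the LFS pointer")
```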