import numpy as np
import torch
import torch.nn as nn
from torch.cuda.amp import custom_bwd, custom_fwd
from transformers.models.llama.modeling_llama import LlamaMLP

try:
    import triton
    import triton.language as tl
    from . import custom_autotune

    @custom_autotune.autotune(
        configs=[
            triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
            triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
            triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
            triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
            triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
            triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
            triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
            triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=4),
            triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
            triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
        ],
        key=['M', 'N', 'K'],
        nearest_power_of_two=True,
        prune_configs_by={
            'early_config_prune': custom_autotune.matmul248_kernel_config_pruner,
            'perf_model': None,
            'top_k': None,
        },
    )
    @triton.jit
    def fusedmatmul_248_kernel(a_ptr, c_ptr,
                               b1_ptr, scales1_ptr, zeros1_ptr, g1_ptr,
                               b2_ptr, scales2_ptr, zeros2_ptr, g2_ptr,
                               M, N, K, bits, maxq,
                               stride_am, stride_ak, stride_bk, stride_bn,
                               stride_cm, stride_cn, stride_scales, stride_zeros,
                               BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr,
                               BLOCK_SIZE_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr):
        """
        Computes C = silu(A @ B1) * (A @ B2), i.e. the fused gate/up half of the LLaMA MLP.

        A        is of shape (M, K), float16 (activations)
        B1, B2   are of shape (K // (32 // bits), N), int32 (packed quantized gate / up weights)
        C        is of shape (M, N), float16
        scales   are of shape (G, N), float16, one row per quantization group
        zeros    are of shape (G, N // (32 // bits)), int32, packed per-group zero points
        g_idx    are of shape (K,), int32, mapping each input feature to its quantization group
        """
        infeature_per_bits = 32 // bits  # how many quantized values are packed into one int32

        pid = tl.program_id(axis=0)
        num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
        num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
        num_pid_k = tl.cdiv(K, BLOCK_SIZE_K)
        num_pid_in_group = GROUP_SIZE_M * num_pid_n
        group_id = pid // num_pid_in_group
        first_pid_m = group_id * GROUP_SIZE_M
        group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
        pid_m = first_pid_m + (pid % group_size_m)
        pid_n = (pid % num_pid_in_group) // group_size_m

        offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
        offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
        offs_k = tl.arange(0, BLOCK_SIZE_K)
        a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
        a_mask = (offs_am[:, None] < M)

        # Pointers into the packed weights, group indices, scales, and zero points.
        b1_ptrs = b1_ptr + ((offs_k[:, None] // infeature_per_bits) * stride_bk + offs_bn[None, :] * stride_bn)
        b2_ptrs = b2_ptr + ((offs_k[:, None] // infeature_per_bits) * stride_bk + offs_bn[None, :] * stride_bn)
        g1_ptrs = g1_ptr + offs_k
        g2_ptrs = g2_ptr + offs_k

        scales1_ptrs = scales1_ptr + offs_bn[None, :]
        scales2_ptrs = scales2_ptr + offs_bn[None, :]
        zeros1_ptrs = zeros1_ptr + (offs_bn[None, :] // infeature_per_bits)
        zeros2_ptrs = zeros2_ptr + (offs_bn[None, :] // infeature_per_bits)

        shifter = (offs_k % infeature_per_bits) * bits
        zeros_shifter = (offs_bn % infeature_per_bits) * bits
        accumulator1 = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
        accumulator2 = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

        for k in range(0, num_pid_k):
            g1_idx = tl.load(g1_ptrs)
            g2_idx = tl.load(g2_ptrs)

            scales1 = tl.load(scales1_ptrs + g1_idx[:, None] * stride_scales)
            scales2 = tl.load(scales2_ptrs + g2_idx[:, None] * stride_scales)

            # Unpack and de-bias the zero points for this K-block.
            zeros1 = tl.load(zeros1_ptrs + g1_idx[:, None] * stride_zeros)
            zeros1 = (zeros1 >> zeros_shifter[None, :]) & maxq
            zeros1 = (zeros1 + 1)

            zeros2 = tl.load(zeros2_ptrs + g2_idx[:, None] * stride_zeros)
            zeros2 = (zeros2 >> zeros_shifter[None, :]) & maxq
            zeros2 = (zeros2 + 1)

            a = tl.load(a_ptrs, mask=a_mask, other=0.)
            b1 = tl.load(b1_ptrs)
            b2 = tl.load(b2_ptrs)

            # Dequantize the gate-projection tile and accumulate A @ B1.
            b1 = (b1 >> shifter[:, None]) & maxq
            b1 = (b1 - zeros1) * scales1
            accumulator1 += tl.dot(a, b1)

            # Dequantize the up-projection tile and accumulate A @ B2.
            b2 = (b2 >> shifter[:, None]) & maxq
            b2 = (b2 - zeros2) * scales2
            accumulator2 += tl.dot(a, b2)

            a_ptrs += BLOCK_SIZE_K  # assumes stride_ak == 1 (A is contiguous after the reshape in the caller)
            b1_ptrs += (BLOCK_SIZE_K // infeature_per_bits) * stride_bk
            b2_ptrs += (BLOCK_SIZE_K // infeature_per_bits) * stride_bk
            g1_ptrs += BLOCK_SIZE_K
            g2_ptrs += BLOCK_SIZE_K

        # Fuse the activation: C = silu(A @ B1) * (A @ B2).
        accumulator1 = silu(accumulator1)
        c = accumulator1 * accumulator2
        c = c.to(tl.float16)
        c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
        c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N)
        tl.store(c_ptrs, c, mask=c_mask)

    @triton.jit
    def silu(x):
        return x * tl.sigmoid(x)
except Exception:
    print('Triton is not installed or failed to load; fused quantized MLP kernels are unavailable.')


class QuantLlamaMLP(nn.Module):

    def __init__(
        self,
        gate_proj,
        down_proj,
        up_proj,
    ):
        super().__init__()
        self.register_buffer('gate_proj_qweight', gate_proj.qweight)
        self.register_buffer('gate_proj_scales', gate_proj.scales)
        self.register_buffer('gate_proj_qzeros', gate_proj.qzeros)
        self.register_buffer('gate_proj_g_idx', gate_proj.g_idx)
        self.register_buffer('up_proj_qweight', up_proj.qweight)
        self.register_buffer('up_proj_scales', up_proj.scales)
        self.register_buffer('up_proj_qzeros', up_proj.qzeros)
        self.register_buffer('up_proj_g_idx', up_proj.g_idx)

        self.infeatures = gate_proj.infeatures
        self.intermediate_size = gate_proj.outfeatures
        self.outfeatures = down_proj.outfeatures
        self.bits = gate_proj.bits
        self.maxq = gate_proj.maxq

        self.down_proj = down_proj

    def forward(self, x):
        return self.down_proj(self.triton_llama_mlp(x))

    def triton_llama_mlp(self, x):
        with torch.cuda.device(x.device):
            out_shape = x.shape[:-1] + (self.intermediate_size, )
            x = x.reshape(-1, x.shape[-1])
            M, K = x.shape
            N = self.intermediate_size
            c = torch.empty((M, N), device=x.device, dtype=torch.float16)
            grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']), )
            fusedmatmul_248_kernel[grid](x, c,
                                         self.gate_proj_qweight, self.gate_proj_scales, self.gate_proj_qzeros, self.gate_proj_g_idx,
                                         self.up_proj_qweight, self.up_proj_scales, self.up_proj_qzeros, self.up_proj_g_idx,
                                         M, N, K, self.bits, self.maxq,
                                         x.stride(0), x.stride(1),
                                         self.gate_proj_qweight.stride(0), self.gate_proj_qweight.stride(1),
                                         c.stride(0), c.stride(1),
                                         self.gate_proj_scales.stride(0), self.gate_proj_qzeros.stride(0))
            c = c.reshape(out_shape)
            return c

    def fused2cuda(self):
        self.gate_proj_qweight = self.gate_proj_qweight.cuda()
        self.gate_proj_scales = self.gate_proj_scales.cuda()
        self.gate_proj_qzeros = self.gate_proj_qzeros.cuda()
        self.gate_proj_g_idx = self.gate_proj_g_idx.cuda()
        self.up_proj_qweight = self.up_proj_qweight.cuda()
        self.up_proj_scales = self.up_proj_scales.cuda()
        self.up_proj_qzeros = self.up_proj_qzeros.cuda()
        self.up_proj_g_idx = self.up_proj_g_idx.cuda()

    def fused2cpu(self):
        self.gate_proj_qweight = self.gate_proj_qweight.cpu()
        self.gate_proj_scales = self.gate_proj_scales.cpu()
        self.gate_proj_qzeros = self.gate_proj_qzeros.cpu()
        self.gate_proj_g_idx = self.gate_proj_g_idx.cpu()
        self.up_proj_qweight = self.up_proj_qweight.cpu()
        self.up_proj_scales = self.up_proj_scales.cpu()
        self.up_proj_qzeros = self.up_proj_qzeros.cpu()
        self.up_proj_g_idx = self.up_proj_g_idx.cpu()


def make_fused_mlp(m, parent_name=''):
    """
    Recursively replace every LlamaMLP module in `m` with a QuantLlamaMLP, which fuses the
    quantized gate/up projections and the SiLU activation into a single Triton kernel
    (see the usage sketch after this function).
    """
    if isinstance(m, LlamaMLP):
        return QuantLlamaMLP(m.gate_proj, m.down_proj, m.up_proj)

    for name, child in m.named_children():
        child = make_fused_mlp(child, parent_name=f"{parent_name}.{name}")

        if isinstance(child, QuantLlamaMLP):
            setattr(m, name, child)
    return m
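

# A minimal usage sketch (hedged): `model` is assumed to be a LlamaForCausalLM whose linear
# projections were already swapped for GPTQ-style QuantLinear modules, so every LlamaMLP
# carries .qweight / .scales / .qzeros / .g_idx on its gate/up/down projections:
#
#     model = make_fused_mlp(model)   # replaces each LlamaMLP with a QuantLlamaMLP in place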


def autotune_warmup_fused(model):
    """
    Pre-tunes the fused quantized kernel: runs it once for every (K, N) shape found in the
    model and for batch-token counts M = 1, 2, 4, ..., 2048, so the Triton autotuner caches
    its best config before real inference traffic arrives.
    """
    from tqdm import tqdm

    kn_values = {}

    for _, m in model.named_modules():
        if not isinstance(m, QuantLlamaMLP):
            continue

        k = m.infeatures
        n = m.intermediate_size

        m.fused2cuda()
        if (k, n) not in kn_values:
            kn_values[(k, n)] = m

    print(f'Found {len(kn_values)} unique fused mlp KN values.')

    print('Warming up autotune cache ...')
    with torch.no_grad():
        for m in tqdm(range(0, 12)):
            m = 2**m  # warm up M = 1, 2, 4, ..., 2048
            for (k, n), modules in kn_values.items():
                a = torch.randn(m, k, dtype=torch.float16, device='cuda')
                modules.triton_llama_mlp(a)

    # Move the quantized buffers back to the CPU once tuning is done.
    for (k, n), modules in kn_values.items():
        modules.fused2cpu()
    del kn_values
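
# Warm-up sketch (hedged): call once after make_fused_mlp and before serving, so later
# forward passes hit the cached kernel configs instead of autotuning on the fly:
#
#     autotune_warmup_fused(model)    # exercises M = 1, 2, 4, ..., 2048 for every (K, N)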
|