Upload triton_flash_blocksparse_attn.py

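Launch the Triton kernels on the device that holds the input tensors by wrapping both kernel launches in `with torch.cuda.device(q.device.index):`. Resolves the issue reported in: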
* https://huggingface.co/microsoft/Phi-3-small-128k-instruct/discussions/19

Based on the change made in this commit:
* https://huggingface.co/microsoft/Phi-3-small-128k-instruct/commit/ed7de9a074b0760e6cf050fe1d103b90834933c8

Files changed (1) hide show

triton_flash_blocksparse_attn.py +58 -56

triton_flash_blocksparse_attn.py CHANGED Viewed

@@ -611,30 +611,31 @@ def _forward(ctx, q, k, v, layout_crow_indices, layout_col_indices, sm_scale, BL
     # print(f'> {q.shape=}, {k.shape=}, {layout_crow_indices.shape}, {layout_col_indices.shape}, {layout_crow_indices.stride()}, \
     #   {layout_col_indices.stride()}, {layout_crow_indices=}, {layout_col_indices=}')
-    _fwd_kernel[grid](
-        q, k, v, sm_scale,
-        layout_crow_indices,
-        layout_col_indices,
-        layout_crow_indices.stride(0), layout_crow_indices.stride(1),
-        layout_col_indices.stride(0), layout_col_indices.stride(1),
-        tmp, L, m,
-        o,
-        q.stride(0), q.stride(1), q.stride(2), q.stride(3),
-        k.stride(0), k.stride(1), k.stride(2), k.stride(3),
-        v.stride(0), v.stride(1), v.stride(2), v.stride(3),
-        o.stride(0), o.stride(1), o.stride(2), o.stride(3),
-        q.shape[0], q.shape[1], k.shape[2],
-        k.shape[2] - q.shape[2],
-        q_rounded_len,
-        BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N,
-        BLOCK_DMODEL=BLOCK_DMODEL,
-        EVEN_M_BLOCK=q.shape[2] % BLOCK_M == 0,
-        EVEN_N_BLOCK=k.shape[2] % BLOCK_N == 0 ,
-        INFERENCE=inference,
-        NUM_DBLOCKS=q.shape[-1] // BLOCK_DMODEL,
-        num_warps=num_warps,
-        num_stages=num_stages,
-    )
     if inference:
         L, m = None, None
@@ -991,37 +992,38 @@ def blocksparse_flash_attn_padded_fwd(
     grid = (len(q_start_sids), n_heads)
-    _fwd_kernel_batch_inference[grid](
-    q, k, v, out,
-    sm_scale,
-    q_batch_starts,
-    q_batch_ends,
-    k_batch_starts,
-    k_batch_ends,
-    q_batch_ids,
-    q_start_sids,
-    *q.stride(),
-    *k.stride(),
-    *v.stride(),
-    *out.stride(),
-    layout_crow_indices,
-    layout_col_indices,
-    *layout_crow_indices.stride(),
-    *layout_col_indices.stride(),
-    q_k_ratio,
-    HAS_BATCH_DIM = True,
-    D_HEAD = head_size,
-    BLOCK_M = block_size,
-    BLOCK_N = block_size,
-    BLOCK_D = block_d,
-    BLOCK_M_LOADING = 16 if q_len == 1 else block_size, # smaller for decoding
-    EVEN_D = block_d == head_size,
-    num_warps = 1 if q_len == 1 else 4,
-    num_stages = 3
-    )
     return out
@@ -1940,4 +1942,4 @@ if __name__ == '__main__':
 # 4   4096.0      3.401622     6.221376       1.636039
 # 5   8192.0     11.915136    23.483391       3.968725
 # 6  16384.0     44.660225    91.302910      10.857130
-# 7  32768.0    175.038467   359.048187      32.778240

     # print(f'> {q.shape=}, {k.shape=}, {layout_crow_indices.shape}, {layout_col_indices.shape}, {layout_crow_indices.stride()}, \
     #   {layout_col_indices.stride()}, {layout_crow_indices=}, {layout_col_indices=}')
+    with torch.cuda.device(q.device.index):
+        _fwd_kernel[grid](
+            q, k, v, sm_scale,
+            layout_crow_indices,
+            layout_col_indices,
+            layout_crow_indices.stride(0), layout_crow_indices.stride(1),
+            layout_col_indices.stride(0), layout_col_indices.stride(1),
+            tmp, L, m,
+            o,
+            q.stride(0), q.stride(1), q.stride(2), q.stride(3),
+            k.stride(0), k.stride(1), k.stride(2), k.stride(3),
+            v.stride(0), v.stride(1), v.stride(2), v.stride(3),
+            o.stride(0), o.stride(1), o.stride(2), o.stride(3),
+            q.shape[0], q.shape[1], k.shape[2],
+            k.shape[2] - q.shape[2],
+            q_rounded_len,
+            BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N,
+            BLOCK_DMODEL=BLOCK_DMODEL,
+            EVEN_M_BLOCK=q.shape[2] % BLOCK_M == 0,
+            EVEN_N_BLOCK=k.shape[2] % BLOCK_N == 0 ,
+            INFERENCE=inference,
+            NUM_DBLOCKS=q.shape[-1] // BLOCK_DMODEL,
+            num_warps=num_warps,
+            num_stages=num_stages,
+        )
     if inference:
         L, m = None, None
     grid = (len(q_start_sids), n_heads)
+    with torch.cuda.device(q.device.index):
+        _fwd_kernel_batch_inference[grid](
+            q, k, v, out,
+            sm_scale,
+            q_batch_starts,
+            q_batch_ends,
+            k_batch_starts,
+            k_batch_ends,
+            q_batch_ids,
+            q_start_sids,
+            *q.stride(),
+            *k.stride(),
+            *v.stride(),
+            *out.stride(),
+            layout_crow_indices,
+            layout_col_indices,
+            *layout_crow_indices.stride(),
+            *layout_col_indices.stride(),
+            q_k_ratio,
+            HAS_BATCH_DIM = True,
+            D_HEAD = head_size,
+            BLOCK_M = block_size,
+            BLOCK_N = block_size,
+            BLOCK_D = block_d,
+            BLOCK_M_LOADING = 16 if q_len == 1 else block_size, # smaller for decoding
+            EVEN_D = block_d == head_size,
+            num_warps = 1 if q_len == 1 else 4,
+            num_stages = 3
+        )
     return out
 # 4   4096.0      3.401622     6.221376       1.636039
 # 5   8192.0     11.915136    23.483391       3.968725
 # 6  16384.0     44.660225    91.302910      10.857130
+# 7  32768.0    175.038467   359.048187      32.778240