|
 except (ImportError, IOError, AttributeError):
     HAS_AITER = False
 
+# [Optional] cutlass_blackwell_fmha backend
+HAS_CUTLASS_BLACKWELL = True
+try:
+    from ai_acceleration.kernels.attentions.cutlass_blackwell_fmha.cutlass_blackwell_fmha_interface import (
+        cutlass_blackwell_fmha_func,
+    )
+    # Disable FA3 for Blackwell as it doesn't work properly
+    HAS_FLASH_V3 = False
+    # Note: We keep FA2 and triton enabled alongside Blackwell for comparison
+except (ImportError, IOError, AttributeError):
+    HAS_CUTLASS_BLACKWELL = False
+
 
 def parse_op_args(args: List[str]):
     parser = argparse.ArgumentParser()
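Aside for reviewers: the block above follows the same optional-backend guard pattern as the existing `HAS_AITER` check, so a missing or broken install degrades to a disabled benchmark rather than an import-time crash. A minimal sketch of the pattern, with a hypothetical backend name standing in for the real import:

```python
# Optional-backend guard: assume the backend is usable, then fall back if the
# import fails. IOError and AttributeError are caught alongside ImportError
# because a partially built or misconfigured extension can raise them at
# import time.
HAS_SOME_BACKEND = True  # hypothetical flag, mirroring HAS_CUTLASS_BLACKWELL
try:
    from some_backend import some_attention_func  # hypothetical module
except (ImportError, IOError, AttributeError):
    HAS_SOME_BACKEND = False
```

Benchmark registration then keys off the flag (see `@register_benchmark(enabled=...)` below), so unavailable backends simply drop out of the run.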
@@ -559,6 +571,34 @@ def fbgemm_gqa_fp8kv(
             cache_logical_dtype_int=1,  # FP8 = 1
         )
 
+    @register_benchmark(enabled=HAS_CUTLASS_BLACKWELL)
+    def cutlass_blackwell_fmha_decode(
+        self,
+        q: torch.Tensor,
+        k_cache: torch.Tensor,
+        v_cache: torch.Tensor,
+        cache_seqlens: torch.Tensor,
+    ) -> Callable:
+        seq_len_q = q.shape[1]
+
+        # Cutlass Blackwell FMHA currently supports only the decode case (seq_len_q == 1); skip others for now.
+        if seq_len_q != 1:
+            raise NotImplementedError(
+                "Cutlass Blackwell FMHA only supports the decode case"
+            )
+
+        # Convert to fp8 format as required by the decode path
+        _q = q.to(torch.float8_e4m3fn)
+        _k_cache = k_cache.to(torch.float8_e4m3fn)
+        _v_cache = v_cache.to(torch.float8_e4m3fn)
+
+        # Create the seqlen_kv tensor (per-sequence KV lengths) for the generation phase
+        seqlen_kv = cache_seqlens.to(dtype=torch.int32, device=q.device)
+
+        return lambda: cutlass_blackwell_fmha_func(
+            _q, _k_cache, _v_cache, causal=CAUSAL, seqlen_kv=seqlen_kv
+        )
+
     @register_benchmark(enabled=HAS_AITER)
     def aiter_paged_fp8kv(
         self,
|