Enable CuTeDSL kernel generation (#190)

jiannanWang · web-flow · commit 61617299b4fc · 2025-10-09T13:26:15.000-07:00
diff --git a/BackendBench/backends/llm.py b/BackendBench/backends/llm.py
@@ -461,7 +461,7 @@ def _kernel_feedback_loop(
         op_name: str,
         op_signature: str,
         op_description: str,
-        framework: str = "triton",
+        dsl: str = "triton",
         attempts: int = 5,
     ) -> Tuple[str, int, bool]:
         """
@@ -473,7 +473,7 @@ def _kernel_feedback_loop(
             op_name: Name of the operation for which to generate a kernel.
             op_signature: Function signature of the operation.
             op_description: Detailed description of the operation.
-            framework: Target framework for the kernel (default: "triton").
+            dsl: Target DSL for the kernel (default: "triton").
             attempts: Maximum number of generation attempts (default: 5).
 
         Returns:
@@ -498,7 +498,7 @@ def _kernel_feedback_loop(
 
             try:
                 kernel_code = self.llm_client.generate_kernel(
-                    op_name, op_signature, op_description, framework, feedback_str
+                    op_name, op_signature, op_description, dsl, feedback_str
                 )
             except Exception as e:
                 logger.info(f"  ✗ Failed to generate kernel: {e}")
@@ -570,7 +570,7 @@ def _kernel_feedback_loop(
             best_kernel_feedback_info.is_correct,
         )
 
-    def generate_kernels(self, suite, attempts=5):
+    def generate_kernels(self, suite, attempts=5, dsl="triton"):
         """Generate kernels for all operators in the suite with comprehensive feedback."""
         successful_ops = 0
         total_ops = 0
@@ -590,6 +590,7 @@ def generate_kernels(self, suite, attempts=5):
                 op_name=op_name,
                 op_signature=f"def {op_name}(*args, **kwargs) -> torch.Tensor",
                 op_description=f"PyTorch operation: {op_name}",
+                dsl=dsl,
                 attempts=attempts,
             )
 
diff --git a/BackendBench/kernel_templates.py b/BackendBench/kernel_templates.py
@@ -11,6 +11,9 @@
 from typing import Dict
 
 from .prompts import (
+    CUTEDSL_EXAMPLE_TEMPLATES,
+    CUTEDSL_KERNEL_PROMPT,
+    CUTEDSL_OPTIMIZATIONS,
     PYTORCH_KERNEL_PROMPT,
     TRITON_EXAMPLE_TEMPLATES,
     TRITON_KERNEL_PROMPT,
@@ -21,9 +24,9 @@
 class KernelTemplate:
     """Base class for kernel templates."""
 
-    def __init__(self, name: str, framework: str):
+    def __init__(self, name: str, dsl: str):
         self.name = name
-        self.framework = framework
+        self.dsl = dsl
 
     def create_prompt(self, op_name: str, op_signature: str, op_description: str) -> str:
         """Create a prompt for kernel generation."""
@@ -76,43 +79,76 @@ def create_prompt(self, op_name: str, op_signature: str, op_description: str) ->
         )
 
 
+class CuTeDSLKernelTemplate(KernelTemplate):
+    """Template for CuTeDSL kernel generation."""
+
+    def __init__(self):
+        super().__init__("cutedsl", "cutedsl")
+
+    def create_prompt(self, op_name: str, op_signature: str, op_description: str) -> str:
+        """Create a specialized prompt for CuTeDSL kernel generation."""
+
+        # Get operation-specific optimizations
+        optimizations = self._get_optimizations(op_name)
+
+        # Get example template
+        example = self._get_example_template(op_name)
+
+        return CUTEDSL_KERNEL_PROMPT.format(
+            op_name=op_name,
+            op_signature=op_signature,
+            op_description=op_description,
+            optimizations=optimizations,
+            example=example,
+        )
+
+    def _get_optimizations(self, op_name: str) -> str:
+        """Get operation-specific optimization guidelines."""
+        return CUTEDSL_OPTIMIZATIONS.get(op_name, CUTEDSL_OPTIMIZATIONS["default"])
+
+    def _get_example_template(self, op_name: str) -> str:
+        """Get operation-specific code template."""
+        return CUTEDSL_EXAMPLE_TEMPLATES["default"]
+
+
 class KernelTemplateManager:
-    """Manages kernel templates for different frameworks."""
+    """Manages kernel templates for different DSLs."""
 
     def __init__(self):
         self.templates: Dict[str, KernelTemplate] = {
             "triton": TritonKernelTemplate(),
             "pytorch": PyTorchKernelTemplate(),
+            "cutedsl": CuTeDSLKernelTemplate(),
             # TODO: Add cuda, cutile, whatever we want
         }
 
-    def get_template(self, framework: str) -> KernelTemplate:
-        """Get template for specified framework."""
-        if framework not in self.templates:
-            raise ValueError(f"Unknown framework: {framework}")
-        return self.templates[framework]
+    def get_template(self, dsl: str) -> KernelTemplate:
+        """Get template for specified DSL."""
+        if dsl not in self.templates:
+            raise ValueError(f"Unknown dsl: {dsl}")
+        return self.templates[dsl]
 
     def create_prompt(
         self,
         op_name: str,
         op_signature: str,
         op_description: str,
-        framework: str = "triton",
+        dsl: str = "triton",
     ) -> str:
         """Create a prompt using the specified template."""
-        template = self.get_template(framework)
+        template = self.get_template(dsl)
         return template.create_prompt(op_name, op_signature, op_description)
 
     def create_refinement_prompt(
         self,
         op_name: str,
         op_signature: str,
         op_description: str,
-        framework: str = "triton",
+        dsl: str = "triton",
         feedback: str = "",
     ) -> str:
         """Create a refinement prompt with feedback from previous attempts."""
-        base_prompt = self.create_prompt(op_name, op_signature, op_description, framework)
+        base_prompt = self.create_prompt(op_name, op_signature, op_description, dsl)
 
         if feedback and feedback.strip():
             refinement_prompt = f"""{feedback}
diff --git a/BackendBench/llm_client.py b/BackendBench/llm_client.py
@@ -75,17 +75,15 @@ def generate_kernel(
         op_name: str,
         op_signature: str,
         op_description: str,
-        framework: str = "triton",
+        dsl: str = "triton",
         feedback: Optional[str] = None,
     ) -> str:
         if feedback:
             prompt = self.template_manager.create_refinement_prompt(
-                op_name, op_signature, op_description, framework, feedback
+                op_name, op_signature, op_description, dsl, feedback
             )
         else:
-            prompt = self.template_manager.create_prompt(
-                op_name, op_signature, op_description, framework
-            )
+            prompt = self.template_manager.create_prompt(op_name, op_signature, op_description, dsl)
 
         print("\n=== DEBUG: PROMPT SENT TO LLM RELAY ===")
         print(prompt)
diff --git a/BackendBench/prompts.py b/BackendBench/prompts.py
@@ -42,3 +42,196 @@
 }
 
 TRITON_EXAMPLE_TEMPLATES = {"default": "See main prompt for example structure."}
+
+CUTEDSL_KERNEL_PROMPT = """Generate a CuteDSL kernel for: {op_name}
+
+Operation: {op_signature}
+{op_description}
+
+Requirements:
+- CuteDSL kernel function MUST be named: {op_name}_cutedsl_kernel
+- Launcher function MUST be named: {op_name}_kernel_launch
+- Wrapper function MUST be named: {op_name}_kernel_impl
+- Use modern CuteDSL syntax with proper grid computation
+- Include all necessary imports (torch, cutlass, cutlass.cute as cute)
+
+The {op_name}_kernel_impl wrapper function MUST handle complete device management:
+- Move CPU tensors to GPU if needed (use .cuda() when torch.cuda.is_available())
+- Raise clear errors if CUDA is not available for GPU tensors
+- Call the CuteDSL kernel with GPU tensors
+- Move results back to original device of input tensors
+- Handle both args and kwargs properly
+- Preserve original tensor devices and restore them for outputs
+- Avoid falling back to PyTorch implementation
+- Avoid using try except block
+
+Generate complete, runnable code only - no framework will add device handling wrapper code.
+
+Example:
+{example}
+"""
+
+CUTEDSL_OPTIMIZATIONS = {
+    "default": "Use efficient memory access patterns and appropriate block sizes."
+}
+
+CUTEDSL_EXAMPLE_TEMPLATES = {
+    "default": """import torch
+import cutlass
+import cutlass.cute as cute
+from cutlass.cute.runtime import from_dlpack
+
+@cute.kernel
+def add_tensor_kernel(
+    gA: cute.Tensor,
+    gB: cute.Tensor,
+    gC: cute.Tensor,
+):
+    tidx, _, _ = cute.arch.thread_idx()
+    bidx, _, _ = cute.arch.block_idx()
+    bdim, _, _ = cute.arch.block_dim()
+
+    thread_idx = bidx * bdim + tidx
+
+    # Map thread index to logical index of input tensor
+    total_elements = gA.shape[0]
+    
+    # Bounds checking
+    if thread_idx < total_elements:
+
+        # Map logical index to physical address via tensor layout
+        a_val = gA[thread_idx]
+        b_val = gB[thread_idx]
+
+        # Perform element-wise addition
+        gC[thread_idx] = a_val + b_val
+
+@cute.kernel
+def add_scalar_kernel(
+    gA: cute.Tensor,
+    gC: cute.Tensor,
+    scalar_val,
+):
+    tidx, _, _ = cute.arch.thread_idx()
+    bidx, _, _ = cute.arch.block_idx()
+    bdim, _, _ = cute.arch.block_dim()
+
+    thread_idx = bidx * bdim + tidx
+
+    # Map thread index to logical index of input tensor
+    total_elements = gA.shape[0]
+    
+    # Bounds checking
+    if thread_idx < total_elements:
+
+        # Map logical index to physical address via tensor layout
+        a_val = gA[thread_idx]
+
+        # Perform element-wise addition with scalar
+        gC[thread_idx] = a_val + scalar_val
+
+@cute.jit
+def add_tensor_kernel_launch(
+    mA: cute.Tensor,
+    mB: cute.Tensor,
+    mC: cute.Tensor
+):
+    num_threads_per_block = 1024
+
+    total_elements = mA.shape[0]
+    num_blocks = (total_elements + num_threads_per_block - 1) // num_threads_per_block
+    
+    kernel = add_tensor_kernel(mA, mB, mC)
+    kernel.launch(grid=(num_blocks, 1, 1),
+                  block=(num_threads_per_block, 1, 1))
+
+@cute.jit
+def add_scalar_kernel_launch(
+    mA: cute.Tensor,
+    mC: cute.Tensor,
+    scalar_val
+):
+    num_threads_per_block = 1024
+
+    total_elements = mA.shape[0]
+    num_blocks = (total_elements + num_threads_per_block - 1) // num_threads_per_block
+    
+    kernel = add_scalar_kernel(mA, mC, scalar_val)
+    kernel.launch(grid=(num_blocks, 1, 1),
+                  block=(num_threads_per_block, 1, 1))
+
+def add_kernel_impl(*args, **kwargs):
+    
+    # Handle both positional and keyword arguments
+    if len(args) >= 2:
+        input_tensor = args[0]
+        other = args[1]
+    elif len(args) == 1 and 'other' in kwargs:
+        input_tensor = args[0]
+        other = kwargs['other']
+    elif 'input' in kwargs and 'other' in kwargs:
+        input_tensor = kwargs['input']
+        other = kwargs['other']
+    else:
+        raise ValueError("add requires 'input' and 'other' arguments")
+    
+    if torch.is_tensor(other):
+        input_tensor, other = torch.broadcast_tensors(input_tensor, other)
+    
+    if 'alpha' in kwargs:
+        alpha = kwargs['alpha']
+        other = other * alpha
+    
+    # Remember original device
+    original_device = input_tensor.device
+
+    # Flatten all tensors and save their shapes
+    original_shape = input_tensor.shape
+    input_tensor = input_tensor.flatten()
+    if torch.is_tensor(other):
+        other = other.flatten()
+    
+    # Move to GPU if needed
+    if not input_tensor.is_cuda:
+        if not torch.cuda.is_available():
+            raise RuntimeError("CUDA is not available")
+        input_tensor = input_tensor.cuda()
+    
+    # Check if other is a tensor or scalar
+    if torch.is_tensor(other):
+        # Tensor + Tensor case
+        if not other.is_cuda:
+            if not torch.cuda.is_available():
+                raise RuntimeError("CUDA is not available")
+            other = other.cuda()
+        
+        output = torch.empty_like(input_tensor)
+        a_ = from_dlpack(input_tensor)
+        b_ = from_dlpack(other)
+        c_ = from_dlpack(output)
+
+        add_tensor_kernel_launch_ = cute.compile(add_tensor_kernel_launch, a_, b_, c_)
+        add_tensor_kernel_launch_(a_, b_, c_)
+    else:
+        # Tensor + Scalar case
+        # Convert scalar to Python float
+        if hasattr(other, 'item'):
+            scalar_val = other.item()
+        else:
+            scalar_val = other
+        
+        output = torch.empty_like(input_tensor)
+        a_ = from_dlpack(input_tensor)
+        c_ = from_dlpack(output)
+
+        add_scalar_kernel_launch_ = cute.compile(add_scalar_kernel_launch, a_, c_, scalar_val)
+        add_scalar_kernel_launch_(a_, c_, scalar_val)
+    
+    # Move result back to original device
+    if original_device != output.device:
+        output = output.to(original_device)
+    
+    output = output.reshape(original_shape)
+    
+    return output"""
+}
diff --git a/BackendBench/scripts/main.py b/BackendBench/scripts/main.py