From fd9109c0fe9a9a18e645c24b88c1ccc4406673b2 Mon Sep 17 00:00:00 2001
From: Jiannan Wang <jiannanwang@meta.com>
Date: Tue, 28 Oct 2025 14:08:50 -0700
Subject: [PATCH 01/19] Add load_inline support for CUDA kernels in DirectoryBackend

---
 BackendBench/backends/directory.py            | 125 +++++++++-
 .../scripts/create_simple_test_cuda_ops.py    |  57 +++++
 test/test_directory_backend.py                | 225 ++++++++++++------
 3 files changed, 332 insertions(+), 75 deletions(-)
 create mode 100644 BackendBench/scripts/create_simple_test_cuda_ops.py

diff --git a/BackendBench/backends/directory.py b/BackendBench/backends/directory.py
index e58c5a6c..38aa1174 100644
--- a/BackendBench/backends/directory.py
+++ b/BackendBench/backends/directory.py
@@ -9,6 +9,8 @@
 import os
 from typing import Callable, Dict
 
+from torch.utils.cpp_extension import load_inline
+
 from ..scripts.op_map import query
 from ..utils import get_pytorch_op
 from .base import Backend
@@ -52,7 +54,8 @@ def _load_kernels(self):
             impl_files = [
                 f
                 for f in os.listdir(op_dir)
-                if f.endswith(".py") and f.startswith(f"{op_name}_implementation")
+                if (f.endswith(".py") or f.endswith(".cu") or f.endswith(".cpp"))
+                and f.startswith(f"{op_name}_implementation")
             ]
             if not impl_files:
                 logger.debug(f"No implementation files found in {op_dir}")
@@ -69,6 +72,7 @@ def _load_kernels(self):
                     for variant_info in op_variants:
                         op_full_name = variant_info["op"]
                         pytorch_op = get_pytorch_op(op_full_name)
+                        print(f"pytorch_op: {pytorch_op}")
                         if pytorch_op:
                             self.compiled_kernels[pytorch_op] = kernel_func
                             logger.info(f"Loaded {op_name} from {impl_file} -> {op_full_name}")
@@ -82,13 +86,9 @@ def _load_kernels(self):
 
         logger.info(f"DirectoryBackend loaded {loaded_count} kernels from {self.ops_dir}/")
 
-    def _load_kernel_from_file(self, file_path: str, op_name: str) -> Callable:
+    def _load_python_kernel(self, file_path: str, op_name: str) -> Callable:
         """
-        Dynamically load a kernel implementation function from a Python file.
-
-        Each operator directory should contain implementation files that export a function
-        named {op_name}_kernel_impl. This function becomes the kernel implementation
-        that gets registered for all variants of the operator.
+        Load a kernel implementation from a Python file.
 
         Args:
             file_path: Path to the Python implementation file
@@ -110,6 +110,117 @@ def _load_kernel_from_file(self, file_path: str, op_name: str) -> Callable:
         else:
             raise ValueError(f"No function named {kernel_func_name} found in {file_path}")
 
+    def setup_cpp_extension(self):
+        global CPP_EXTENSION_IS_SETUP
+        if not CPP_EXTENSION_IS_SETUP:
+            from setuptools import setup
+            from torch.utils import cpp_extension
+
+            setup(
+                name="extension_cpp",
+                ext_modules=[
+                    cpp_extension.CppExtension(
+                        "extension_cpp",
+                        ["muladd.cpp"],
+                        # define Py_LIMITED_API with min version 3.9 to expose only the stable
+                        # limited API subset from Python.h
+                        extra_compile_args={"cxx": ["-DPy_LIMITED_API=0x03090000"]},
+                        py_limited_api=True,
+                    )
+                ],  # Build 1 wheel across multiple Python versions
+                cmdclass={"build_ext": cpp_extension.BuildExtension},
+                options={
+                    "bdist_wheel": {"py_limited_api": "cp39"}
+                },  # 3.9 is minimum supported Python version
+            )
+            CPP_EXTENSION_IS_SETUP = True
+
+    def _load_cuda_kernel(self, file_path: str, op_name: str) -> Callable:
+        """
+        Load and compile a kernel implementation from CUDA files using load_inline.
+
+        Args:
+            file_path: Path to the CUDA implementation file (.cu or .cpp)
+            op_name: Base name of the operator (e.g., "add__Tensor")
+
+        Returns:
+            Callable kernel implementation function
+
+        Raises:
+            ValueError: If the expected kernel function is not found in the compiled module
+        """
+        file_dir = os.path.dirname(file_path)
+        file_name = os.path.basename(file_path)
+        base_name = file_name.rsplit(".", 1)[0]
+
+        cu_file = os.path.join(file_dir, f"{base_name}.cu")
+        cpp_file = os.path.join(file_dir, f"{base_name}.cpp")
+
+        cpp_source = ""
+        cuda_source = ""
+
+        # Read both files if they exist
+        if os.path.exists(cu_file):
+            with open(cu_file, "r") as f:
+                cuda_source = f.read()
+        print(f"cuda_source: {cuda_source}")
+
+        if os.path.exists(cpp_file):
+            with open(cpp_file, "r") as f:
+                cpp_source = f.read()
+        print(f"cpp_source: {cpp_source}")
+
+        # Use load_inline for all cases
+        module_name = f"{op_name}_cuda_inline"
+        cuda_module = load_inline(
+            name=module_name,
+            cpp_sources=cpp_source,
+            cuda_sources=cuda_source,
+            functions=[op_name],
+            verbose=True,
+        )
+
+        # x = torch.randn(4, 4, device="cuda", dtype=torch.float32)
+        # y = torch.randn(4, 4, device="cuda", dtype=torch.float32)
+
+        # print(getattr(cuda_module, op_name)(x, y))
+
+        if hasattr(cuda_module, op_name):
+            return getattr(cuda_module, op_name)
+        else:
+            raise ValueError(
+                f"No function named {op_name} found in compiled CUDA module from {file_path}"
+            )
+
+    def _load_kernel_from_file(self, file_path: str, op_name: str) -> Callable:
+        """
+        Dynamically load a kernel implementation function from a Python or CUDA file.
+
+        Dispatches to the appropriate loader based on file extension:
+        - .py files -> _load_python_kernel
+        - .cu or .cpp files -> _load_cuda_kernel
+
+        Args:
+            file_path: Path to the implementation file (Python or CUDA)
+            op_name: Base name of the operator (e.g., "add", "mul", "conv2d")
+
+        Returns:
+            Callable kernel implementation function
+
+        Raises:
+            ValueError: If the file extension is unsupported or the kernel function is not found
+        """
+        file_ext = os.path.splitext(file_path)[1]
+
+        if file_ext == ".py":
+            return self._load_python_kernel(file_path, op_name)
+        elif file_ext in [".cu", ".cpp"]:
+            return self._load_cuda_kernel(file_path, op_name)
+        else:
+            raise ValueError(
+                f"Unsupported file extension {file_ext} for {file_path}. Expected .py, .cu, or .cpp"
+            )
+
     def __getitem__(self, key):
         if key in self.compiled_kernels:
             return self.compiled_kernels[key]
diff --git a/BackendBench/scripts/create_simple_test_cuda_ops.py b/BackendBench/scripts/create_simple_test_cuda_ops.py
new file mode 100644
index 00000000..7afa28bd
--- /dev/null
+++ b/BackendBench/scripts/create_simple_test_cuda_ops.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Create simple kernel implementations for 5 common operations.
+Each just calls the original PyTorch function.
+"""
+
+import logging
+import os
+
+logger = logging.getLogger(__name__)
+
+
+def create_add():
+    os.makedirs("generated_kernels_cuda/add", exist_ok=True)
+    with open("generated_kernels_cuda/add/add_implementation_v1.cu", "w") as f:
+        f.write("""
+__global__ void add_kernel(
+    const float* __restrict__ x,
+    const float* __restrict__ y,
+    float* __restrict__ output,
+    const int size) {
+    const auto index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size) {
+    output[index] = x[index] + y[index];
+    }
+}
+
+torch::Tensor add(torch::Tensor x, torch::Tensor y) {
+    auto output = torch::zeros_like(x);
+    const int threads = 1024;
+    const int blocks = (output.numel() + threads - 1) / threads;
+    add_kernel<<<blocks, threads>>>(x.data<float>(), y.data<float>(), output.data<float>(), output.numel());
+    return output;
+}
+""")
+    with open("generated_kernels_cuda/add/add_implementation_v1.cpp", "w") as f:
+        f.write("""torch::Tensor add(torch::Tensor x, torch::Tensor y);""")
+    logger.info("Created add implementation")
+
+
+def main():
+    """Create 5 simple test operations."""
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+    logger.info("Creating cuda kernel implementations for testing...")
+
+    create_add()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/test_directory_backend.py b/test/test_directory_backend.py
index f5662fc6..f132b28e 100644
--- a/test/test_directory_backend.py
+++ b/test/test_directory_backend.py
@@ -19,100 +19,189 @@
 from BackendBench.backends import DirectoryBackend
 
 
-@pytest.fixture(scope="module")
-def backend():
-    # Always create correct test implementations, overriding any watermarked ones
-    import subprocess
+class TestDirectoryBackend:
+    @pytest.fixture(scope="class")
+    def backend(self):
+        # Always create correct test implementations, overriding any watermarked ones
+        import subprocess
 
-    subprocess.run(
-        [sys.executable, "-m", "BackendBench.scripts.create_simple_test_ops"], check=True
-    )
+        subprocess.run(
+            [sys.executable, "-m", "BackendBench.scripts.create_simple_test_ops"], check=True
+        )
 
-    return DirectoryBackend(ops_dir="generated_kernels")
+        return DirectoryBackend(ops_dir="generated_kernels")
 
+    def test_relu_operation(self, backend):
+        relu_op = torch.ops.aten.relu.default
+        assert relu_op in backend
 
-def test_relu_operation(backend):
-    relu_op = torch.ops.aten.relu.default
-    assert relu_op in backend
+        our_impl = backend[relu_op]
+        x = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
+        result = our_impl(x)
+        expected = relu_op(x)
 
-    our_impl = backend[relu_op]
-    x = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
-    result = our_impl(x)
-    expected = relu_op(x)
+        assert torch.allclose(result, expected)
 
-    assert torch.allclose(result, expected)
+    def test_add_operation(self, backend):
+        add_op = torch.ops.aten.add.Tensor
+        assert add_op in backend
 
+        our_impl = backend[add_op]
+        a = torch.tensor([1.0, 2.0, 3.0])
+        b = torch.tensor([4.0, 5.0, 6.0])
+        result = our_impl(a, b)
+        expected = add_op(a, b)
+        print(f"result: {result}, expected: {expected}")
 
-def test_add_operation(backend):
-    add_op = torch.ops.aten.add.Tensor
-    assert add_op in backend
+        assert torch.allclose(result, expected)
 
-    our_impl = backend[add_op]
-    a = torch.tensor([1.0, 2.0, 3.0])
-    b = torch.tensor([4.0, 5.0, 6.0])
-    result = our_impl(a, b)
-    expected = add_op(a, b)
+    def test_mul_operation(self, backend):
+        mul_op = torch.ops.aten.mul.Tensor
+        assert mul_op in backend
 
-    assert torch.allclose(result, expected)
+        our_impl = backend[mul_op]
+        a = torch.tensor([1.0, 2.0, 3.0])
+        b = torch.tensor([4.0, 5.0, 6.0])
+        result = our_impl(a, b)
+        expected = mul_op(a, b)
 
+        assert torch.allclose(result, expected)
 
-def test_mul_operation(backend):
-    mul_op = torch.ops.aten.mul.Tensor
-    assert mul_op in backend
+    def test_abs_operation(self, backend):
+        abs_op = torch.ops.aten.abs.default
+        assert abs_op in backend
 
-    our_impl = backend[mul_op]
-    a = torch.tensor([1.0, 2.0, 3.0])
-    b = torch.tensor([4.0, 5.0, 6.0])
-    result = our_impl(a, b)
-    expected = mul_op(a, b)
+        our_impl = backend[abs_op]
+        x = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
+        result = our_impl(x)
+        expected = abs_op(x)
 
-    assert torch.allclose(result, expected)
+        assert torch.allclose(result, expected)
 
+    def test_sum_operation(self, backend):
+        sum_op = torch.ops.aten.sum.default
+        assert sum_op in backend
 
-def test_abs_operation(backend):
-    abs_op = torch.ops.aten.abs.default
-    assert abs_op in backend
+        our_impl = backend[sum_op]
+        x = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
+        result = our_impl(x)
+        expected = sum_op(x)
 
-    our_impl = backend[abs_op]
-    x = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
-    result = our_impl(x)
-    expected = abs_op(x)
+        assert torch.allclose(result, expected)
 
-    assert torch.allclose(result, expected)
+    def test_backend_loading(self, backend):
+        loaded_ops = set(backend.compiled_kernels.keys())
+        assert len(loaded_ops) > 0
 
+        if os.path.exists("generated_kernels"):
+            dirs = [
+                d
+                for d in os.listdir("generated_kernels")
+                if os.path.isdir(os.path.join("generated_kernels", d))
+            ]
+            assert len(dirs) > 0
 
-def test_sum_operation(backend):
-    sum_op = torch.ops.aten.sum.default
-    assert sum_op in backend
+    def test_kernel_directories_exist(self, backend):
+        assert os.path.exists("generated_kernels")
 
-    our_impl = backend[sum_op]
-    x = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
-    result = our_impl(x)
-    expected = sum_op(x)
+        expected_dirs = ["relu", "add", "mul", "abs", "sum"]
+        for expected_dir in expected_dirs:
+            dir_path = os.path.join("generated_kernels", expected_dir)
+            assert os.path.isdir(dir_path)
 
-    assert torch.allclose(result, expected)
+            py_files = [f for f in os.listdir(dir_path) if f.endswith(".py")]
+            assert len(py_files) > 0
 
 
-def test_backend_loading(backend):
-    loaded_ops = set(backend.compiled_kernels.keys())
-    assert len(loaded_ops) > 0
+class TestDirectoryBackendCUDA:
+    @pytest.fixture(scope="class")
+    def backend(self):
+        # Always create correct test implementations, overriding any watermarked ones
+        import subprocess
 
-    if os.path.exists("generated_kernels"):
-        dirs = [
-            d
-            for d in os.listdir("generated_kernels")
-            if os.path.isdir(os.path.join("generated_kernels", d))
-        ]
-        assert len(dirs) > 0
+        subprocess.run(
+            [sys.executable, "-m", "BackendBench.scripts.create_simple_test_ops"], check=True
+        )
 
+        return DirectoryBackend(ops_dir="generated_kernels")
 
-def test_kernel_directories_exist(backend):
-    assert os.path.exists("generated_kernels")
+    def test_relu_operation(self, backend):
+        relu_op = torch.ops.aten.relu.default
+        assert relu_op in backend
 
-    expected_dirs = ["relu", "add", "mul", "abs", "sum"]
-    for expected_dir in expected_dirs:
-        dir_path = os.path.join("generated_kernels", expected_dir)
-        assert os.path.isdir(dir_path)
+        our_impl = backend[relu_op]
+        x = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
+        result = our_impl(x)
+        expected = relu_op(x)
 
-        py_files = [f for f in os.listdir(dir_path) if f.endswith(".py")]
-        assert len(py_files) > 0
+        assert torch.allclose(result, expected)
+
+    def test_add_operation(self, backend):
+        add_op = torch.ops.aten.add.Tensor
+        assert add_op in backend
+
+        our_impl = backend[add_op]
+        a = torch.tensor([1.0, 2.0, 3.0])
+        b = torch.tensor([4.0, 5.0, 6.0])
+        result = our_impl(a, b)
+        expected = add_op(a, b)
+        print(f"result: {result}, expected: {expected}")
+
+        assert torch.allclose(result, expected)
+
+    def test_mul_operation(self, backend):
+        mul_op = torch.ops.aten.mul.Tensor
+        assert mul_op in backend
+
+        our_impl = backend[mul_op]
+        a = torch.tensor([1.0, 2.0, 3.0])
+        b = torch.tensor([4.0, 5.0, 6.0])
+        result = our_impl(a, b)
+        expected = mul_op(a, b)
+
+        assert torch.allclose(result, expected)
+
+    def test_abs_operation(self, backend):
+        abs_op = torch.ops.aten.abs.default
+        assert abs_op in backend
+
+        our_impl = backend[abs_op]
+        x = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
+        result = our_impl(x)
+        expected = abs_op(x)
+
+        assert torch.allclose(result, expected)
+
+    def test_sum_operation(self, backend):
+        sum_op = torch.ops.aten.sum.default
+        assert sum_op in backend
+
+        our_impl = backend[sum_op]
+        x = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
+        result = our_impl(x)
+        expected = sum_op(x)
+
+        assert torch.allclose(result, expected)
+
+    def test_backend_loading(self, backend):
+        loaded_ops = set(backend.compiled_kernels.keys())
+        assert len(loaded_ops) > 0
+
+        if os.path.exists("generated_kernels"):
+            dirs = [
+                d
+                for d in os.listdir("generated_kernels")
+                if os.path.isdir(os.path.join("generated_kernels", d))
+            ]
+            assert len(dirs) > 0
+
+    def test_kernel_directories_exist(self, backend):
+        assert os.path.exists("generated_kernels")
+
+        expected_dirs = ["relu", "add", "mul", "abs", "sum"]
+        for expected_dir in expected_dirs:
+            dir_path = os.path.join("generated_kernels", expected_dir)
+            assert os.path.isdir(dir_path)
+
+            py_files = [f for f in os.listdir(dir_path) if f.endswith(".py")]
+            assert len(py_files) > 0

From 93e5d4cb2d7918dd6fb3c0777c15448dcd0c785d Mon Sep 17 00:00:00 2001
From: Jiannan Wang <jiannanwang@meta.com>
Date: Tue, 28 Oct 2025 15:57:44 -0700
Subject: [PATCH 02/19] Rename CUDA test-op script, add --base-dir, trim CUDA tests

---
 ..._ops.py => create_simple_test_ops_cuda.py} | 23 ++++---
 test/test_directory_backend.py                | 61 ++++---------------
 2 files changed, 28 insertions(+), 56 deletions(-)
 rename BackendBench/scripts/{create_simple_test_cuda_ops.py => create_simple_test_ops_cuda.py} (67%)

diff --git a/BackendBench/scripts/create_simple_test_cuda_ops.py b/BackendBench/scripts/create_simple_test_ops_cuda.py
similarity index 67%
rename from BackendBench/scripts/create_simple_test_cuda_ops.py
rename to BackendBench/scripts/create_simple_test_ops_cuda.py
index 7afa28bd..e526929e 100644
--- a/BackendBench/scripts/create_simple_test_cuda_ops.py
+++ b/BackendBench/scripts/create_simple_test_ops_cuda.py
@@ -11,15 +11,16 @@
 Each just calls the original PyTorch function.
 """
 
+import argparse
 import logging
 import os
 
 logger = logging.getLogger(__name__)
 
 
-def create_add():
-    os.makedirs("generated_kernels_cuda/add", exist_ok=True)
-    with open("generated_kernels_cuda/add/add_implementation_v1.cu", "w") as f:
+def create_add(base_dir):
+    os.makedirs(f"{base_dir}/add", exist_ok=True)
+    with open(f"{base_dir}/add/add_implementation_v1.cu", "w") as f:
         f.write("""
 __global__ void add_kernel(
     const float* __restrict__ x,
@@ -40,17 +41,23 @@ def create_add():
     return output;
 }
 """)
-    with open("generated_kernels_cuda/add/add_implementation_v1.cpp", "w") as f:
+    with open(f"{base_dir}/add/add_implementation_v1.cpp", "w") as f:
         f.write("""torch::Tensor add(torch::Tensor x, torch::Tensor y);""")
     logger.info("Created add implementation")
 
 
 def main():
-    """Create 5 simple test operations."""
-    logging.basicConfig(level=logging.INFO, format="%(message)s")
-    logger.info("Creating cuda kernel implementations for testing...")
+    """Create 1 simple test operations."""
+    parser = argparse.ArgumentParser(description="Creating cuda kernel implementations for testing")
+    parser.add_argument(
+        "--base-dir",
+        default="generated_kernels",
+        help="Base directory containing operator subdirectories",
+    )
 
-    create_add()
+    args = parser.parse_args()
+
+    create_add(args.base_dir)
 
 
 if __name__ == "__main__":
diff --git a/test/test_directory_backend.py b/test/test_directory_backend.py
index f132b28e..46839967 100644
--- a/test/test_directory_backend.py
+++ b/test/test_directory_backend.py
@@ -113,76 +113,41 @@ def test_kernel_directories_exist(self, backend):
             assert len(py_files) > 0
 
 
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available")
 class TestDirectoryBackendCUDA:
+    base_dir = "generated_kernels_cuda"
+
     @pytest.fixture(scope="class")
     def backend(self):
         # Always create correct test implementations, overriding any watermarked ones
         import subprocess
 
         subprocess.run(
-            [sys.executable, "-m", "BackendBench.scripts.create_simple_test_ops"], check=True
+            [
+                sys.executable,
+                "-m",
+                "BackendBench.scripts.create_simple_test_ops_cuda",
+                "--base-dir",
+                self.base_dir,
+            ],
+            check=True,
         )
 
         return DirectoryBackend(ops_dir="generated_kernels")
 
-    def test_relu_operation(self, backend):
-        relu_op = torch.ops.aten.relu.default
-        assert relu_op in backend
-
-        our_impl = backend[relu_op]
-        x = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
-        result = our_impl(x)
-        expected = relu_op(x)
-
-        assert torch.allclose(result, expected)
-
     def test_add_operation(self, backend):
         add_op = torch.ops.aten.add.Tensor
         assert add_op in backend
 
         our_impl = backend[add_op]
-        a = torch.tensor([1.0, 2.0, 3.0])
-        b = torch.tensor([4.0, 5.0, 6.0])
+        a = torch.tensor([1.0, 2.0, 3.0]).cuda()
+        b = torch.tensor([4.0, 5.0, 6.0]).cuda()
         result = our_impl(a, b)
         expected = add_op(a, b)
         print(f"result: {result}, expected: {expected}")
 
         assert torch.allclose(result, expected)
 
-    def test_mul_operation(self, backend):
-        mul_op = torch.ops.aten.mul.Tensor
-        assert mul_op in backend
-
-        our_impl = backend[mul_op]
-        a = torch.tensor([1.0, 2.0, 3.0])
-        b = torch.tensor([4.0, 5.0, 6.0])
-        result = our_impl(a, b)
-        expected = mul_op(a, b)
-
-        assert torch.allclose(result, expected)
-
-    def test_abs_operation(self, backend):
-        abs_op = torch.ops.aten.abs.default
-        assert abs_op in backend
-
-        our_impl = backend[abs_op]
-        x = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
-        result = our_impl(x)
-        expected = abs_op(x)
-
-        assert torch.allclose(result, expected)
-
-    def test_sum_operation(self, backend):
-        sum_op = torch.ops.aten.sum.default
-        assert sum_op in backend
-
-        our_impl = backend[sum_op]
-        x = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
-        result = our_impl(x)
-        expected = sum_op(x)
-
-        assert torch.allclose(result, expected)
-
     def test_backend_loading(self, backend):
         loaded_ops = set(backend.compiled_kernels.keys())
         assert len(loaded_ops) > 0

From d7f80743b44546e30667e4cd2b0d1c576bf3f479 Mon Sep 17 00:00:00 2001
From: Jiannan Wang <jiannanwang@meta.com>
Date: Wed, 29 Oct 2025 21:56:23 -0700
Subject: [PATCH 03/19] Pass folder_name to kernel loaders; remove setup_cpp_extension

---
 BackendBench/backends/directory.py | 54 ++++++++----------------------
 1 file changed, 14 insertions(+), 40 deletions(-)

diff --git a/BackendBench/backends/directory.py b/BackendBench/backends/directory.py
index 36eb165b..38a778c2 100644
--- a/BackendBench/backends/directory.py
+++ b/BackendBench/backends/directory.py
@@ -11,7 +11,7 @@
 
 from torch.utils.cpp_extension import load_inline
 
-from ..utils import folder_name_to_op_name, get_pytorch_op, op_name_to_folder_name
+from ..utils import folder_name_to_op_name, get_pytorch_op
 from .base import Backend
 
 logger = logging.getLogger(__name__)
@@ -61,7 +61,7 @@ def _load_kernels(self):
 
             try:
                 op_name = folder_name_to_op_name(folder_name)
-                kernel_func = self._load_kernel_from_file(impl_path, op_name)
+                kernel_func = self._load_kernel_from_file(impl_path, folder_name)
 
                 pytorch_op = get_pytorch_op(op_name)
                 if pytorch_op:
@@ -74,13 +74,13 @@ def _load_kernels(self):
 
         logger.info(f"DirectoryBackend loaded {loaded_count} kernels from {self.ops_dir}/")
 
-    def _load_python_kernel(self, file_path: str, op_name: str) -> Callable:
+    def _load_python_kernel(self, file_path: str, folder_name: str) -> Callable:
         """
         Load a kernel implementation from a Python file.
 
         Args:
             file_path: Path to the Python implementation file
-            op_name: Base name of the operator (e.g., "add", "mul", "conv2d")
+            folder_name: Base name of the operator (e.g., "add__Tensor")
 
         Returns:
             Callable kernel implementation function
@@ -88,7 +88,6 @@ def _load_python_kernel(self, file_path: str, op_name: str) -> Callable:
         Raises:
             ValueError: If the expected kernel function is not found in the file
         """
-        folder_name = op_name_to_folder_name(op_name)
         spec = importlib.util.spec_from_file_location(f"op_{folder_name}", file_path)
         module = importlib.util.module_from_spec(spec)
         spec.loader.exec_module(module)
@@ -99,38 +98,13 @@ def _load_python_kernel(self, file_path: str, op_name: str) -> Callable:
         else:
             raise ValueError(f"No function named {kernel_func_name} found in {file_path}")
 
-    def setup_cpp_extension(self):
-        global CPP_EXTENSION_IS_SETUP
-        if not CPP_EXTENSION_IS_SETUP:
-            from setuptools import setup
-            from torch.utils import cpp_extension
-
-            setup(
-                name="extension_cpp",
-                ext_modules=[
-                    cpp_extension.CppExtension(
-                        "extension_cpp",
-                        ["muladd.cpp"],
-                        # define Py_LIMITED_API with min version 3.9 to expose only the stable
-                        # limited API subset from Python.h
-                        extra_compile_args={"cxx": ["-DPy_LIMITED_API=0x03090000"]},
-                        py_limited_api=True,
-                    )
-                ],  # Build 1 wheel across multiple Python versions
-                cmdclass={"build_ext": cpp_extension.BuildExtension},
-                options={
-                    "bdist_wheel": {"py_limited_api": "cp39"}
-                },  # 3.9 is minimum supported Python version
-            )
-            CPP_EXTENSION_IS_SETUP = True
-
-    def _load_cuda_kernel(self, file_path: str, op_name: str) -> Callable:
+    def _load_cuda_kernel(self, file_path: str, folder_name: str) -> Callable:
         """
         Load and compile a kernel implementation from CUDA files using load_inline.
 
         Args:
             file_path: Path to the CUDA implementation file (.cu or .cpp)
-            op_name: Base name of the operator (e.g., "add__Tensor")
+            folder_name: Base name of the operator (e.g., "add__Tensor")
 
         Returns:
             Callable kernel implementation function
@@ -160,12 +134,12 @@ def _load_cuda_kernel(self, file_path: str, op_name: str) -> Callable:
         print(f"cpp_source: {cpp_source}")
 
         # Use load_inline for all cases
-        module_name = f"{op_name}_cuda_inline"
+        module_name = f"{folder_name}_cuda_inline"
         cuda_module = load_inline(
             name=module_name,
             cpp_sources=cpp_source,
             cuda_sources=cuda_source,
-            functions=[op_name],
+            functions=[folder_name],
             verbose=True,
         )
 
@@ -174,14 +148,14 @@ def _load_cuda_kernel(self, file_path: str, op_name: str) -> Callable:
 
         # print(getattr(cuda_module, op_name)(x, y))
 
-        if hasattr(cuda_module, op_name):
-            return getattr(cuda_module, op_name)
+        if hasattr(cuda_module, folder_name):
+            return getattr(cuda_module, folder_name)
         else:
             raise ValueError(
-                f"No function named {op_name} found in compiled CUDA module from {file_path}"
+                f"No function named {folder_name} found in compiled CUDA module from {file_path}"
             )
 
-    def ß(self, file_path: str, op_name: str) -> Callable:
+    def _load_kernel_from_file(self, file_path: str, folder_name: str) -> Callable:
         """
         Dynamically load a kernel implementation function from a Python or CUDA file.
 
@@ -202,9 +176,9 @@ def ß(self, file_path: str, op_name: str) -> Callable:
         file_ext = os.path.splitext(file_path)[1]
 
         if file_ext == ".py":
-            return self._load_python_kernel(file_path, op_name)
+            return self._load_python_kernel(file_path, folder_name)
         elif file_ext in [".cu", ".cpp"]:
-            return self._load_cuda_kernel(file_path, op_name)
+            return self._load_cuda_kernel(file_path, folder_name)
         else:
             raise ValueError(
                 f"Unsupported file extension {file_ext} for {file_path}. Expected .py, .cu, or .cpp"

From d1c864963027f65526b9c7a1da228d1564a6271a Mon Sep 17 00:00:00 2001
From: Jiannan Wang <jiannanwang@meta.com>
Date: Wed, 29 Oct 2025 22:39:53 -0700
Subject: [PATCH 04/19] Remove debug prints; rename CUDA test kernel to add__Tensor

---
 BackendBench/backends/directory.py            |   7 -
 .../scripts/create_simple_test_ops_cuda.py    |  14 +-
 test/test_directory_backend.py                | 122 ++++++++++--------
 3 files changed, 73 insertions(+), 70 deletions(-)

diff --git a/BackendBench/backends/directory.py b/BackendBench/backends/directory.py
index 38a778c2..aea953c6 100644
--- a/BackendBench/backends/directory.py
+++ b/BackendBench/backends/directory.py
@@ -126,12 +126,10 @@ def _load_cuda_kernel(self, file_path: str, folder_name: str) -> Callable:
         if os.path.exists(cu_file):
             with open(cu_file, "r") as f:
                 cuda_source = f.read()
-        print(f"cuda_source: {cuda_source}")
 
         if os.path.exists(cpp_file):
             with open(cpp_file, "r") as f:
                 cpp_source = f.read()
-        print(f"cpp_source: {cpp_source}")
 
         # Use load_inline for all cases
         module_name = f"{folder_name}_cuda_inline"
@@ -143,11 +141,6 @@ def _load_cuda_kernel(self, file_path: str, folder_name: str) -> Callable:
             verbose=True,
         )
 
-        # x = torch.randn(4, 4, device="cuda", dtype=torch.float32)
-        # y = torch.randn(4, 4, device="cuda", dtype=torch.float32)
-
-        # print(getattr(cuda_module, op_name)(x, y))
-
         if hasattr(cuda_module, folder_name):
             return getattr(cuda_module, folder_name)
         else:
diff --git a/BackendBench/scripts/create_simple_test_ops_cuda.py b/BackendBench/scripts/create_simple_test_ops_cuda.py
index e526929e..e94d8bd2 100644
--- a/BackendBench/scripts/create_simple_test_ops_cuda.py
+++ b/BackendBench/scripts/create_simple_test_ops_cuda.py
@@ -19,10 +19,10 @@
 
 
 def create_add(base_dir):
-    os.makedirs(f"{base_dir}/add", exist_ok=True)
-    with open(f"{base_dir}/add/add_implementation_v1.cu", "w") as f:
+    os.makedirs(f"{base_dir}/add__Tensor", exist_ok=True)
+    with open(f"{base_dir}/add__Tensor/add__Tensor_implementation_v1.cu", "w") as f:
         f.write("""
-__global__ void add_kernel(
+__global__ void add__Tensor_kernel(
     const float* __restrict__ x,
     const float* __restrict__ y,
     float* __restrict__ output,
@@ -33,16 +33,16 @@ def create_add(base_dir):
     }
 }
 
-torch::Tensor add(torch::Tensor x, torch::Tensor y) {
+torch::Tensor add__Tensor(torch::Tensor x, torch::Tensor y) {
     auto output = torch::zeros_like(x);
     const int threads = 1024;
     const int blocks = (output.numel() + threads - 1) / threads;
-    add_kernel<<<blocks, threads>>>(x.data<float>(), y.data<float>(), output.data<float>(), output.numel());
+    add__Tensor_kernel<<<blocks, threads>>>(x.data<float>(), y.data<float>(), output.data<float>(), output.numel());
     return output;
 }
 """)
-    with open(f"{base_dir}/add/add_implementation_v1.cpp", "w") as f:
-        f.write("""torch::Tensor add(torch::Tensor x, torch::Tensor y);""")
+    with open(f"{base_dir}/add__Tensor/add__Tensor_implementation_v1.cpp", "w") as f:
+        f.write("""torch::Tensor add__Tensor(torch::Tensor x, torch::Tensor y);""")
     logger.info("Created add implementation")
 
 
diff --git a/test/test_directory_backend.py b/test/test_directory_backend.py
index 46839967..603dc2bc 100644
--- a/test/test_directory_backend.py
+++ b/test/test_directory_backend.py
@@ -17,20 +17,25 @@
 import torch
 
 from BackendBench.backends import DirectoryBackend
+from BackendBench.utils import op_name_to_folder_name
 
 
-class TestDirectoryBackend:
-    @pytest.fixture(scope="class")
-    def backend(self):
-        # Always create correct test implementations, overriding any watermarked ones
-        import subprocess
+@pytest.fixture(scope="class")
+def backend(request):
+    # Always create correct test implementations, overriding any watermarked ones
+    import subprocess
+
+    subprocess.run(
+        [sys.executable, "-m", "BackendBench.scripts.create_simple_test_ops"], check=True
+    )
+    yield DirectoryBackend(ops_dir="generated_kernels")
 
-        subprocess.run(
-            [sys.executable, "-m", "BackendBench.scripts.create_simple_test_ops"], check=True
-        )
+    import shutil
 
-        return DirectoryBackend(ops_dir="generated_kernels")
+    shutil.rmtree("generated_kernels", ignore_errors=True)
 
+
+class TestDirectoryBackend:
     def test_relu_operation(self, backend):
         relu_op = torch.ops.aten.relu.default
         assert relu_op in backend
@@ -51,7 +56,6 @@ def test_add_operation(self, backend):
         b = torch.tensor([4.0, 5.0, 6.0])
         result = our_impl(a, b)
         expected = add_op(a, b)
-        print(f"result: {result}, expected: {expected}")
 
         assert torch.allclose(result, expected)
 
@@ -93,19 +97,20 @@ def test_backend_loading(self, backend):
         loaded_ops = set(backend.compiled_kernels.keys())
         assert len(loaded_ops) > 0
 
-        if os.path.exists("generated_kernels"):
-            dirs = [
-                d
-                for d in os.listdir("generated_kernels")
-                if os.path.isdir(os.path.join("generated_kernels", d))
-            ]
-            assert len(dirs) > 0
+        assert os.path.exists("generated_kernels")
+        dirs = [
+            d
+            for d in os.listdir("generated_kernels")
+            if os.path.isdir(os.path.join("generated_kernels", d))
+        ]
+        assert len(dirs) > 0
 
     def test_kernel_directories_exist(self, backend):
         assert os.path.exists("generated_kernels")
 
-        expected_dirs = ["relu", "add", "mul", "abs", "sum"]
-        for expected_dir in expected_dirs:
+        expected_ops = ["relu.default", "add.Tensor", "mul.Tensor", "abs.default", "sum.default"]
+        for expected_op in expected_ops:
+            expected_dir = op_name_to_folder_name(expected_op)
             dir_path = os.path.join("generated_kernels", expected_dir)
             assert os.path.isdir(dir_path)
 
@@ -113,60 +118,65 @@ def test_kernel_directories_exist(self, backend):
             assert len(py_files) > 0
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available")
-class TestDirectoryBackendCUDA:
-    base_dir = "generated_kernels_cuda"
+@pytest.fixture(scope="class")
+def backend_cuda(request):
+    import subprocess
 
-    @pytest.fixture(scope="class")
-    def backend(self):
-        # Always create correct test implementations, overriding any watermarked ones
-        import subprocess
+    # Access class attribute via request.cls
+    base_dir = getattr(request.cls, "base_dir", "generated_kernels_cuda")
+    subprocess.run(
+        [
+            sys.executable,
+            "-m",
+            "BackendBench.scripts.create_simple_test_ops_cuda",
+            "--base-dir",
+            base_dir,
+        ],
+        check=True,
+    )
+    backend_instance = DirectoryBackend(ops_dir=base_dir)
+    yield backend_instance
+    # Optional: Teardown logic here (e.g., remove base_dir directory)
+    import shutil
 
-        subprocess.run(
-            [
-                sys.executable,
-                "-m",
-                "BackendBench.scripts.create_simple_test_ops_cuda",
-                "--base-dir",
-                self.base_dir,
-            ],
-            check=True,
-        )
+    shutil.rmtree(base_dir, ignore_errors=True)
 
-        return DirectoryBackend(ops_dir="generated_kernels")
 
-    def test_add_operation(self, backend):
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available")
+class TestDirectoryBackendCUDA:
+    base_dir = "generated_kernels_cuda"
+
+    def test_add_operation(self, backend_cuda):
         add_op = torch.ops.aten.add.Tensor
-        assert add_op in backend
+        assert add_op in backend_cuda
 
-        our_impl = backend[add_op]
+        our_impl = backend_cuda[add_op]
         a = torch.tensor([1.0, 2.0, 3.0]).cuda()
         b = torch.tensor([4.0, 5.0, 6.0]).cuda()
         result = our_impl(a, b)
         expected = add_op(a, b)
-        print(f"result: {result}, expected: {expected}")
 
         assert torch.allclose(result, expected)
 
-    def test_backend_loading(self, backend):
-        loaded_ops = set(backend.compiled_kernels.keys())
+    def test_backend_loading(self, backend_cuda):
+        loaded_ops = set(backend_cuda.compiled_kernels.keys())
         assert len(loaded_ops) > 0
+        assert os.path.exists(self.base_dir)
 
-        if os.path.exists("generated_kernels"):
-            dirs = [
-                d
-                for d in os.listdir("generated_kernels")
-                if os.path.isdir(os.path.join("generated_kernels", d))
-            ]
-            assert len(dirs) > 0
+        dirs = [
+            d for d in os.listdir(self.base_dir) if os.path.isdir(os.path.join(self.base_dir, d))
+        ]
+        assert len(dirs) > 0
 
-    def test_kernel_directories_exist(self, backend):
-        assert os.path.exists("generated_kernels")
+    def test_kernel_directories_exist(self, backend_cuda):
+        assert os.path.exists(self.base_dir)
 
-        expected_dirs = ["relu", "add", "mul", "abs", "sum"]
+        expected_dirs = ["add__Tensor"]
         for expected_dir in expected_dirs:
-            dir_path = os.path.join("generated_kernels", expected_dir)
+            dir_path = os.path.join(self.base_dir, expected_dir)
             assert os.path.isdir(dir_path)
 
-            py_files = [f for f in os.listdir(dir_path) if f.endswith(".py")]
-            assert len(py_files) > 0
+            cuda_files = [
+                f for f in os.listdir(dir_path) if f.endswith(".cu") or f.endswith(".cpp")
+            ]
+            assert len(cuda_files) > 0

From 334a39eaa0767fad1092804a530f2b6e030a2ace Mon Sep 17 00:00:00 2001
From: Jiannan Wang <jiannanwang@meta.com>
Date: Wed, 29 Oct 2025 22:47:15 -0700
Subject: [PATCH 05/19] add ninja to ci

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index ae6bafcc..a335d742 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ dependencies = [
     "pandas",
     "datasets",
     "tenacity",
+    "ninja",
 ]
 
 [project.optional-dependencies]

From 08b7054b651e0e78a4af9596b6dd53c9bd7460c3 Mon Sep 17 00:00:00 2001
From: Jiannan Wang <jiannanwang@meta.com>
Date: Wed, 29 Oct 2025 23:00:32 -0700
Subject: [PATCH 06/19] set CUDA_HOME

---
 .github/workflows/smoke-test.yml | 35 +++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml
index b81d1b21..a49942d9 100644
--- a/.github/workflows/smoke-test.yml
+++ b/.github/workflows/smoke-test.yml
@@ -23,7 +23,40 @@ jobs:
     
     - name: Install package and dependencies
       run: uv sync --dev
-    
+      
+    - name: Set CUDA_HOME
+      run: |
+        # Find CUDA installation
+        if [ -d "/usr/local/cuda" ]; then
+          echo "CUDA_HOME=/usr/local/cuda" >> $GITHUB_ENV
+          echo "Found CUDA at /usr/local/cuda"
+        elif [ -d "/usr/lib/cuda" ]; then
+          echo "CUDA_HOME=/usr/lib/cuda" >> $GITHUB_ENV
+          echo "Found CUDA at /usr/lib/cuda"
+        else
+          # Try to find CUDA using which nvcc
+          NVCC_PATH=$(which nvcc 2>/dev/null || echo "")
+          if [ -n "$NVCC_PATH" ]; then
+            CUDA_HOME=$(dirname $(dirname $NVCC_PATH))
+            echo "CUDA_HOME=$CUDA_HOME" >> $GITHUB_ENV
+            echo "Found CUDA at $CUDA_HOME"
+          else
+            echo "Warning: CUDA installation not found, tests may fail"
+          fi
+        fi
+        
+    - name: Verify CUDA setup
+      run: |
+        echo "CUDA_HOME: $CUDA_HOME"
+        if [ -n "$CUDA_HOME" ]; then
+          ls -la "$CUDA_HOME" || echo "CUDA_HOME directory does not exist"
+          if [ -f "$CUDA_HOME/bin/nvcc" ]; then
+            "$CUDA_HOME/bin/nvcc" --version
+          else
+            echo "nvcc not found at $CUDA_HOME/bin/nvcc"
+          fi
+        fi
+      
     - name: Clone FACTO source
       run: git clone https://github.com/pytorch-labs/FACTO.git
     

From 44a427b33bc19d6c58eb48d41efc28911c3a2d8e Mon Sep 17 00:00:00 2001
From: Jiannan Wang <jiannanwang@meta.com>
Date: Wed, 29 Oct 2025 23:11:58 -0700
Subject: [PATCH 07/19] add skip

---
 test/test_directory_backend.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_directory_backend.py b/test/test_directory_backend.py
index 603dc2bc..c61440fb 100644
--- a/test/test_directory_backend.py
+++ b/test/test_directory_backend.py
@@ -143,6 +143,7 @@ def backend_cuda(request):
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available")
+@pytest.mark.skipif("CUDA_HOME" not in os.environ, reason="CUDA_HOME is not available")
 class TestDirectoryBackendCUDA:
     base_dir = "generated_kernels_cuda"
 

From 1cf7b4d1d99006283c23d12d665e4dd915d01073 Mon Sep 17 00:00:00 2001
From: Jiannan Wang <jiannanwang@meta.com>
Date: Wed, 29 Oct 2025 23:12:18 -0700
Subject: [PATCH 08/19] fix

---
 .github/workflows/smoke-test.yml | 35 +-------------------------------
 1 file changed, 1 insertion(+), 34 deletions(-)

diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml
index a49942d9..b81d1b21 100644
--- a/.github/workflows/smoke-test.yml
+++ b/.github/workflows/smoke-test.yml
@@ -23,40 +23,7 @@ jobs:
     
     - name: Install package and dependencies
       run: uv sync --dev
-      
-    - name: Set CUDA_HOME
-      run: |
-        # Find CUDA installation
-        if [ -d "/usr/local/cuda" ]; then
-          echo "CUDA_HOME=/usr/local/cuda" >> $GITHUB_ENV
-          echo "Found CUDA at /usr/local/cuda"
-        elif [ -d "/usr/lib/cuda" ]; then
-          echo "CUDA_HOME=/usr/lib/cuda" >> $GITHUB_ENV
-          echo "Found CUDA at /usr/lib/cuda"
-        else
-          # Try to find CUDA using which nvcc
-          NVCC_PATH=$(which nvcc 2>/dev/null || echo "")
-          if [ -n "$NVCC_PATH" ]; then
-            CUDA_HOME=$(dirname $(dirname $NVCC_PATH))
-            echo "CUDA_HOME=$CUDA_HOME" >> $GITHUB_ENV
-            echo "Found CUDA at $CUDA_HOME"
-          else
-            echo "Warning: CUDA installation not found, tests may fail"
-          fi
-        fi
-        
-    - name: Verify CUDA setup
-      run: |
-        echo "CUDA_HOME: $CUDA_HOME"
-        if [ -n "$CUDA_HOME" ]; then
-          ls -la "$CUDA_HOME" || echo "CUDA_HOME directory does not exist"
-          if [ -f "$CUDA_HOME/bin/nvcc" ]; then
-            "$CUDA_HOME/bin/nvcc" --version
-          else
-            echo "nvcc not found at $CUDA_HOME/bin/nvcc"
-          fi
-        fi
-      
+    
     - name: Clone FACTO source
       run: git clone https://github.com/pytorch-labs/FACTO.git
     

From 401473e3a9f5adb391044da7223c2496b9061a6f Mon Sep 17 00:00:00 2001
From: Jiannan Wang <jiannanwang@meta.com>
Date: Thu, 30 Oct 2025 11:09:50 -0700
Subject: [PATCH 09/19] add no_implicit_headers

---
 BackendBench/backends/directory.py            |  2 +-
 .../scripts/create_simple_test_ops_cuda.py    | 23 ++++++++++++-------
 test/test_directory_backend.py                |  5 ++--
 3 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/BackendBench/backends/directory.py b/BackendBench/backends/directory.py
index aea953c6..4cb96528 100644
--- a/BackendBench/backends/directory.py
+++ b/BackendBench/backends/directory.py
@@ -138,7 +138,7 @@ def _load_cuda_kernel(self, file_path: str, folder_name: str) -> Callable:
             cpp_sources=cpp_source,
             cuda_sources=cuda_source,
             functions=[folder_name],
-            verbose=True,
+            no_implicit_headers=True,
         )
 
         if hasattr(cuda_module, folder_name):
diff --git a/BackendBench/scripts/create_simple_test_ops_cuda.py b/BackendBench/scripts/create_simple_test_ops_cuda.py
index e94d8bd2..b585a0d7 100644
--- a/BackendBench/scripts/create_simple_test_ops_cuda.py
+++ b/BackendBench/scripts/create_simple_test_ops_cuda.py
@@ -21,7 +21,9 @@
 def create_add(base_dir):
     os.makedirs(f"{base_dir}/add__Tensor", exist_ok=True)
     with open(f"{base_dir}/add__Tensor/add__Tensor_implementation_v1.cu", "w") as f:
-        f.write("""
+        f.write("""#include <torch/extension.h>
+#include <ATen/cuda/CUDAContext.h>
+
 __global__ void add__Tensor_kernel(
     const float* __restrict__ x,
     const float* __restrict__ y,
@@ -33,16 +35,21 @@ def create_add(base_dir):
     }
 }
 
-torch::Tensor add__Tensor(torch::Tensor x, torch::Tensor y) {
-    auto output = torch::zeros_like(x);
-    const int threads = 1024;
-    const int blocks = (output.numel() + threads - 1) / threads;
-    add__Tensor_kernel<<<blocks, threads>>>(x.data<float>(), y.data<float>(), output.data<float>(), output.numel());
-    return output;
+at::Tensor add__Tensor(const at::Tensor& a, const at::Tensor& b) {
+    auto out = at::empty_like(a);
+    int64_t numel = a.numel();
+    const int threads = 256;
+    const int blocks = (numel + threads - 1) / threads;
+    add__Tensor_kernel<<<blocks, threads, 0, c10::cuda::getCurrentCUDAStream()>>>(
+        a.data_ptr<float>(), b.data_ptr<float>(), out.data_ptr<float>(), numel
+    );
+    return out;
 }
 """)
     with open(f"{base_dir}/add__Tensor/add__Tensor_implementation_v1.cpp", "w") as f:
-        f.write("""torch::Tensor add__Tensor(torch::Tensor x, torch::Tensor y);""")
+        f.write("""#include <torch/extension.h>
+
+at::Tensor add__Tensor(const at::Tensor& a, const at::Tensor& b);""")
     logger.info("Created add implementation")
 
 
diff --git a/test/test_directory_backend.py b/test/test_directory_backend.py
index c61440fb..247fff0f 100644
--- a/test/test_directory_backend.py
+++ b/test/test_directory_backend.py
@@ -135,15 +135,16 @@ def backend_cuda(request):
         check=True,
     )
     backend_instance = DirectoryBackend(ops_dir=base_dir)
+
     yield backend_instance
-    # Optional: Teardown logic here (e.g., remove base_dir directory)
+
     import shutil
 
     shutil.rmtree(base_dir, ignore_errors=True)
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available")
-@pytest.mark.skipif("CUDA_HOME" not in os.environ, reason="CUDA_HOME is not available")
+# @pytest.mark.skipif("CUDA_HOME" not in os.environ, reason="CUDA_HOME is not available")
 class TestDirectoryBackendCUDA:
     base_dir = "generated_kernels_cuda"
 

From 39eb648ddb1232042ea5c17b224b0befe70de901 Mon Sep 17 00:00:00 2001
From: Jiannan Wang <jiannanwang@meta.com>
Date: Thu, 30 Oct 2025 11:19:41 -0700
Subject: [PATCH 10/19] test

---
 .github/workflows/smoke-test.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml
index b81d1b21..6a8b40e1 100644
--- a/.github/workflows/smoke-test.yml
+++ b/.github/workflows/smoke-test.yml
@@ -33,6 +33,9 @@ jobs:
     - name: Run smoke test
       run: uv run python -m BackendBench.scripts.main --suite smoke --backend aten 
     
+    - name: Find cuda version
+      run: nvcc --version | grep -oP 'release \K[0-9]+.[0- && ls /usr/local | grep cuda
+    
     - name: Run FACTO test
       run: uv run python -m BackendBench.scripts.main --suite facto --backend aten --ops "add.Tensor" 
     

From e417006764bd9635c0ffc5d6bab094ce2832c105 Mon Sep 17 00:00:00 2001
From: Jiannan Wang <jiannanwang@meta.com>
Date: Thu, 30 Oct 2025 12:01:23 -0700
Subject: [PATCH 11/19] test cuda version

---
 .github/workflows/smoke-test.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml
index 6a8b40e1..39445833 100644
--- a/.github/workflows/smoke-test.yml
+++ b/.github/workflows/smoke-test.yml
@@ -33,8 +33,10 @@ jobs:
     - name: Run smoke test
       run: uv run python -m BackendBench.scripts.main --suite smoke --backend aten 
     
-    - name: Find cuda version
-      run: nvcc --version | grep -oP 'release \K[0-9]+.[0- && ls /usr/local | grep cuda
+    - name: Find CUDA version
+      run: |
+        nvcc --version | grep -oP 'release \K[0-9]+\.[0-9]+'
+        ls /usr/local | grep cuda
     
     - name: Run FACTO test
       run: uv run python -m BackendBench.scripts.main --suite facto --backend aten --ops "add.Tensor" 

From e65f8e9ef69278c3422486e8cc4607c885f20633 Mon Sep 17 00:00:00 2001
From: Jiannan Wang <jiannanwang@meta.com>
Date: Thu, 30 Oct 2025 12:51:25 -0700
Subject: [PATCH 12/19] update

---
 .github/workflows/smoke-test.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml
index 39445833..6a790651 100644
--- a/.github/workflows/smoke-test.yml
+++ b/.github/workflows/smoke-test.yml
@@ -35,7 +35,6 @@ jobs:
     
     - name: Find CUDA version
       run: |
-        nvcc --version | grep -oP 'release \K[0-9]+\.[0-9]+'
         ls /usr/local | grep cuda
     
     - name: Run FACTO test

From 945353d8841fda6631d064584b796e29acfe5534 Mon Sep 17 00:00:00 2001
From: Jiannan Wang <jiannanwang@meta.com>
Date: Thu, 30 Oct 2025 12:58:42 -0700
Subject: [PATCH 13/19] install cuda toolkit in ci

---
 .github/workflows/smoke-test.yml | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml
index 6a790651..bcb3aa12 100644
--- a/.github/workflows/smoke-test.yml
+++ b/.github/workflows/smoke-test.yml
@@ -14,6 +14,11 @@ jobs:
 
     steps:
     - uses: actions/checkout@v4
+
+    - name: Install CUDA Toolkit
+      uses: Jimver/cuda-toolkit@v0.2.18
+      with:
+        cuda: '12.4.0'
     
     - name: Install uv
       uses: astral-sh/setup-uv@v3
@@ -21,6 +26,10 @@ jobs:
     - name: Set up Python
       run: uv python install 3.13
     
+    - name: Find CUDA version
+      run: |
+        ls /usr/local | grep cuda
+    
     - name: Install package and dependencies
       run: uv sync --dev
     
@@ -33,9 +42,6 @@ jobs:
     - name: Run smoke test
       run: uv run python -m BackendBench.scripts.main --suite smoke --backend aten 
     
-    - name: Find CUDA version
-      run: |
-        ls /usr/local | grep cuda
     
     - name: Run FACTO test
       run: uv run python -m BackendBench.scripts.main --suite facto --backend aten --ops "add.Tensor" 

From c35add9d971bfb8ad7ce1444e47fbbc05a818bf6 Mon Sep 17 00:00:00 2001
From: Jiannan Wang <jiannanwang@meta.com>
Date: Thu, 30 Oct 2025 12:58:56 -0700
Subject: [PATCH 14/19] install cuda toolkit in ci

---
 .github/workflows/smoke-test.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml
index bcb3aa12..4e6ae315 100644
--- a/.github/workflows/smoke-test.yml
+++ b/.github/workflows/smoke-test.yml
@@ -26,9 +26,9 @@ jobs:
     - name: Set up Python
       run: uv python install 3.13
     
-    - name: Find CUDA version
-      run: |
-        ls /usr/local | grep cuda
+    # - name: Find CUDA version
+    #   run: |
+    #     ls /usr/local | grep cuda
     
     - name: Install package and dependencies
       run: uv sync --dev

From e99eb6af8dc70c635619f1388c4ea2ddc1c65782 Mon Sep 17 00:00:00 2001
From: Jiannan Wang <jiannanwang@meta.com>
Date: Thu, 30 Oct 2025 13:19:56 -0700
Subject: [PATCH 15/19] install cuda toolkit in ci

---
 .github/workflows/smoke-test.yml | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml
index 4e6ae315..d836ccde 100644
--- a/.github/workflows/smoke-test.yml
+++ b/.github/workflows/smoke-test.yml
@@ -14,11 +14,6 @@ jobs:
 
     steps:
     - uses: actions/checkout@v4
-
-    - name: Install CUDA Toolkit
-      uses: Jimver/cuda-toolkit@v0.2.18
-      with:
-        cuda: '12.4.0'
     
     - name: Install uv
       uses: astral-sh/setup-uv@v3
@@ -31,7 +26,13 @@ jobs:
     #     ls /usr/local | grep cuda
     
     - name: Install package and dependencies
-      run: uv sync --dev
+      run: |
+        uv sync --dev
+        uv pip install "nvidia-cuda-runtime-cu12==12.4.127" \
+                "nvidia-cublas-cu12==12.4.5.8" \
+                "nvidia-cuda-nvrtc-cu12==12.4.127" \
+                "nvidia-cuda-nvcc-cu12==12.4.131"
+        python -m pip list | grep nvidia
     
     - name: Clone FACTO source
       run: git clone https://github.com/pytorch-labs/FACTO.git

From 05fe3b8bce396cd712beedb2575b748049ae835f Mon Sep 17 00:00:00 2001
From: Jiannan Wang <jiannanwang@meta.com>
Date: Thu, 30 Oct 2025 13:23:42 -0700
Subject: [PATCH 16/19] fix

---
 .github/workflows/smoke-test.yml | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml
index d836ccde..fa03a6a8 100644
--- a/.github/workflows/smoke-test.yml
+++ b/.github/workflows/smoke-test.yml
@@ -21,18 +21,23 @@ jobs:
     - name: Set up Python
       run: uv python install 3.13
     
-    # - name: Find CUDA version
-    #   run: |
-    #     ls /usr/local | grep cuda
-    
     - name: Install package and dependencies
+      run: uv sync --dev
+    
+    - name: Verify CUDA environment
       run: |
-        uv sync --dev
-        uv pip install "nvidia-cuda-runtime-cu12==12.4.127" \
-                "nvidia-cublas-cu12==12.4.5.8" \
-                "nvidia-cuda-nvrtc-cu12==12.4.127" \
-                "nvidia-cuda-nvcc-cu12==12.4.131"
-        python -m pip list | grep nvidia
+        python - <<'PY'
+        import torch, os, subprocess
+        print("PyTorch version:", torch.__version__)
+        print("CUDA available:", torch.cuda.is_available())
+        print("CUDA device count:", torch.cuda.device_count())
+        print("CUDA_HOME:", torch.utils.cpp_extension.CUDA_HOME)
+        try:
+            out = subprocess.check_output(["python", "-m", "nvidia.cuda.nvcc", "--version"], text=True)
+            print("NVCC version:\n", out)
+        except Exception as e:
+            print("NVCC not found:", e)
+        PY
     
     - name: Clone FACTO source
       run: git clone https://github.com/pytorch-labs/FACTO.git

From 734c93321374c5ae0b6d73ab3ebf073eedcd8361 Mon Sep 17 00:00:00 2001
From: Jiannan Wang <jiannanwang@meta.com>
Date: Thu, 30 Oct 2025 13:25:18 -0700
Subject: [PATCH 17/19] fix

---
 .github/workflows/smoke-test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml
index fa03a6a8..6e8e75e6 100644
--- a/.github/workflows/smoke-test.yml
+++ b/.github/workflows/smoke-test.yml
@@ -26,7 +26,7 @@ jobs:
     
     - name: Verify CUDA environment
       run: |
-        python - <<'PY'
+        uv run python - <<'PY'
         import torch, os, subprocess
         print("PyTorch version:", torch.__version__)
         print("CUDA available:", torch.cuda.is_available())

From 42fbe3606dbc2cadda8c7bdc998530bd163ee084 Mon Sep 17 00:00:00 2001
From: Jiannan Wang <jiannanwang@meta.com>
Date: Thu, 30 Oct 2025 13:28:54 -0700
Subject: [PATCH 18/19] fix

---
 .github/workflows/smoke-test.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml
index 6e8e75e6..ee4af4d8 100644
--- a/.github/workflows/smoke-test.yml
+++ b/.github/workflows/smoke-test.yml
@@ -31,7 +31,8 @@ jobs:
         print("PyTorch version:", torch.__version__)
         print("CUDA available:", torch.cuda.is_available())
         print("CUDA device count:", torch.cuda.device_count())
-        print("CUDA_HOME:", torch.utils.cpp_extension.CUDA_HOME)
+        from torch.utils.cpp_extension import CUDA_HOME
+        print("CUDA_HOME:", CUDA_HOME)
         try:
             out = subprocess.check_output(["python", "-m", "nvidia.cuda.nvcc", "--version"], text=True)
             print("NVCC version:\n", out)

From f75fbaf79535b40a847cf8eb712677ae5c91887f Mon Sep 17 00:00:00 2001
From: Jiannan Wang <jiannanwang@meta.com>
Date: Thu, 30 Oct 2025 13:34:56 -0700
Subject: [PATCH 19/19] skip cuda testing in CI since no CUDA_HOME

---
 .github/workflows/smoke-test.yml | 16 ----------------
 test/test_directory_backend.py   |  7 ++++++-
 2 files changed, 6 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml
index ee4af4d8..2adf0052 100644
--- a/.github/workflows/smoke-test.yml
+++ b/.github/workflows/smoke-test.yml
@@ -24,22 +24,6 @@ jobs:
     - name: Install package and dependencies
       run: uv sync --dev
     
-    - name: Verify CUDA environment
-      run: |
-        uv run python - <<'PY'
-        import torch, os, subprocess
-        print("PyTorch version:", torch.__version__)
-        print("CUDA available:", torch.cuda.is_available())
-        print("CUDA device count:", torch.cuda.device_count())
-        from torch.utils.cpp_extension import CUDA_HOME
-        print("CUDA_HOME:", CUDA_HOME)
-        try:
-            out = subprocess.check_output(["python", "-m", "nvidia.cuda.nvcc", "--version"], text=True)
-            print("NVCC version:\n", out)
-        except Exception as e:
-            print("NVCC not found:", e)
-        PY
-    
     - name: Clone FACTO source
       run: git clone https://github.com/pytorch-labs/FACTO.git
     
diff --git a/test/test_directory_backend.py b/test/test_directory_backend.py
index 247fff0f..1baaef2d 100644
--- a/test/test_directory_backend.py
+++ b/test/test_directory_backend.py
@@ -19,6 +19,11 @@
 from BackendBench.backends import DirectoryBackend
 from BackendBench.utils import op_name_to_folder_name
 
+try:
+    from torch.utils.cpp_extension import CUDA_HOME
+except ImportError:
+    CUDA_HOME = None
+
 
 @pytest.fixture(scope="class")
 def backend(request):
@@ -144,7 +149,7 @@ def backend_cuda(request):
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available")
-# @pytest.mark.skipif("CUDA_HOME" not in os.environ, reason="CUDA_HOME is not available")
+@pytest.mark.skipif(CUDA_HOME is None, reason="CUDA_HOME is not available")
 class TestDirectoryBackendCUDA:
     base_dir = "generated_kernels_cuda"