From fd9109c0fe9a9a18e645c24b88c1ccc4406673b2 Mon Sep 17 00:00:00 2001 From: Jiannan Wang Date: Tue, 28 Oct 2025 14:08:50 -0700 Subject: [PATCH 01/19] add loadinline to support cuda kernel --- BackendBench/backends/directory.py | 125 +++++++++- .../scripts/create_simple_test_cuda_ops.py | 57 +++++ test/test_directory_backend.py | 225 ++++++++++++------ 3 files changed, 332 insertions(+), 75 deletions(-) create mode 100644 BackendBench/scripts/create_simple_test_cuda_ops.py diff --git a/BackendBench/backends/directory.py b/BackendBench/backends/directory.py index e58c5a6c..38aa1174 100644 --- a/BackendBench/backends/directory.py +++ b/BackendBench/backends/directory.py @@ -9,6 +9,8 @@ import os from typing import Callable, Dict +from torch.utils.cpp_extension import load_inline + from ..scripts.op_map import query from ..utils import get_pytorch_op from .base import Backend @@ -52,7 +54,8 @@ def _load_kernels(self): impl_files = [ f for f in os.listdir(op_dir) - if f.endswith(".py") and f.startswith(f"{op_name}_implementation") + if (f.endswith(".py") or f.endswith(".cu") or f.endswith(".cpp")) + and f.startswith(f"{op_name}_implementation") ] if not impl_files: logger.debug(f"No implementation files found in {op_dir}") @@ -69,6 +72,7 @@ def _load_kernels(self): for variant_info in op_variants: op_full_name = variant_info["op"] pytorch_op = get_pytorch_op(op_full_name) + print(f"pytorch_op: {pytorch_op}") if pytorch_op: self.compiled_kernels[pytorch_op] = kernel_func logger.info(f"Loaded {op_name} from {impl_file} -> {op_full_name}") @@ -82,13 +86,9 @@ def _load_kernels(self): logger.info(f"DirectoryBackend loaded {loaded_count} kernels from {self.ops_dir}/") - def _load_kernel_from_file(self, file_path: str, op_name: str) -> Callable: + def _load_python_kernel(self, file_path: str, op_name: str) -> Callable: """ - Dynamically load a kernel implementation function from a Python file. - - Each operator directory should contain implementation files that export a function - named {op_name}_kernel_impl. This function becomes the kernel implementation - that gets registered for all variants of the operator. + Load a kernel implementation from a Python file. Args: file_path: Path to the Python implementation file @@ -110,6 +110,117 @@ def _load_kernel_from_file(self, file_path: str, op_name: str) -> Callable: else: raise ValueError(f"No function named {kernel_func_name} found in {file_path}") + def setup_cpp_extension(self): + global CPP_EXTENSION_IS_SETUP + if not CPP_EXTENSION_IS_SETUP: + from setuptools import setup + from torch.utils import cpp_extension + + setup( + name="extension_cpp", + ext_modules=[ + cpp_extension.CppExtension( + "extension_cpp", + ["muladd.cpp"], + # define Py_LIMITED_API with min version 3.9 to expose only the stable + # limited API subset from Python.h + extra_compile_args={"cxx": ["-DPy_LIMITED_API=0x03090000"]}, + py_limited_api=True, + ) + ], # Build 1 wheel across multiple Python versions + cmdclass={"build_ext": cpp_extension.BuildExtension}, + options={ + "bdist_wheel": {"py_limited_api": "cp39"} + }, # 3.9 is minimum supported Python version + ) + CPP_EXTENSION_IS_SETUP = True + + def _load_cuda_kernel(self, file_path: str, op_name: str) -> Callable: + """ + Load and compile a kernel implementation from CUDA files using load_inline. + + Args: + file_path: Path to the CUDA implementation file (.cu or .cpp) + op_name: Base name of the operator (e.g., "add__Tensor") + + Returns: + Callable kernel implementation function + + Raises: + ValueError: If the expected kernel function is not found in the compiled module + """ + file_dir = os.path.dirname(file_path) + file_name = os.path.basename(file_path) + base_name = file_name.rsplit(".", 1)[0] + + cu_file = os.path.join(file_dir, f"{base_name}.cu") + cpp_file = os.path.join(file_dir, f"{base_name}.cpp") + + cpp_source = "" + cuda_source = "" + + # Read both files if they exist + if os.path.exists(cu_file): + with open(cu_file, "r") as f: + cuda_source = f.read() + print(f"cuda_source: {cuda_source}") + + if os.path.exists(cpp_file): + with open(cpp_file, "r") as f: + cpp_source = f.read() + print(f"cpp_source: {cpp_source}") + + # Use load_inline for all cases + module_name = f"{op_name}_cuda_inline" + cuda_module = load_inline( + name=module_name, + cpp_sources=cpp_source, + cuda_sources=cuda_source, + functions=[op_name], + verbose=True, + ) + + # x = torch.randn(4, 4, device="cuda", dtype=torch.float32) + # y = torch.randn(4, 4, device="cuda", dtype=torch.float32) + + # print(getattr(cuda_module, op_name)(x, y)) + + if hasattr(cuda_module, op_name): + return getattr(cuda_module, op_name) + else: + raise ValueError( + f"No function named {op_name} found in compiled CUDA module from {file_path}" + ) + + def _load_kernel_from_file(self, file_path: str, op_name: str) -> Callable: + """ + Dynamically load a kernel implementation function from a Python or CUDA file. + + Dispatches to the appropriate loader based on file extension: + - .py files -> _load_python_kernel + - .cu or .cpp files -> _load_cuda_kernel + + Args: + file_path: Path to the implementation file (Python or CUDA) + op_name: Base name of the operator (e.g., "add", "mul", "conv2d") + + Returns: + Callable kernel implementation function + + Raises: + ValueError: If the file extension is unsupported or the kernel function is not found + """ + file_ext = os.path.splitext(file_path)[1] + + if file_ext == ".py": + return self._load_python_kernel(file_path, op_name) + elif file_ext in [".cu", ".cpp"]: + return self._load_cuda_kernel(file_path, op_name) + else: + raise ValueError( + f"Unsupported file extension {file_ext} for {file_path}. Expected .py, .cu, or .cpp" + ) + def __getitem__(self, key): if key in self.compiled_kernels: return self.compiled_kernels[key] diff --git a/BackendBench/scripts/create_simple_test_cuda_ops.py b/BackendBench/scripts/create_simple_test_cuda_ops.py new file mode 100644 index 00000000..7afa28bd --- /dev/null +++ b/BackendBench/scripts/create_simple_test_cuda_ops.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 + +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. + +""" +Create simple kernel implementations for 5 common operations. +Each just calls the original PyTorch function. +""" + +import logging +import os + +logger = logging.getLogger(__name__) + + +def create_add(): + os.makedirs("generated_kernels_cuda/add", exist_ok=True) + with open("generated_kernels_cuda/add/add_implementation_v1.cu", "w") as f: + f.write(""" +__global__ void add_kernel( + const float* __restrict__ x, + const float* __restrict__ y, + float* __restrict__ output, + const int size) { + const auto index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < size) { + output[index] = x[index] + y[index]; + } +} + +torch::Tensor add(torch::Tensor x, torch::Tensor y) { + auto output = torch::zeros_like(x); + const int threads = 1024; + const int blocks = (output.numel() + threads - 1) / threads; + add_kernel<<>>(x.data(), y.data(), output.data(), output.numel()); + return output; +} +""") + with open("generated_kernels_cuda/add/add_implementation_v1.cpp", "w") as f: + f.write("""torch::Tensor add(torch::Tensor x, torch::Tensor y);""") + logger.info("Created add implementation") + + +def main(): + """Create 5 simple test operations.""" + logging.basicConfig(level=logging.INFO, format="%(message)s") + logger.info("Creating cuda kernel implementations for testing...") + + create_add() + + +if __name__ == "__main__": + main() diff --git a/test/test_directory_backend.py b/test/test_directory_backend.py index f5662fc6..f132b28e 100644 --- a/test/test_directory_backend.py +++ b/test/test_directory_backend.py @@ -19,100 +19,189 @@ from BackendBench.backends import DirectoryBackend -@pytest.fixture(scope="module") -def backend(): - # Always create correct test implementations, overriding any watermarked ones - import subprocess +class TestDirectoryBackend: + @pytest.fixture(scope="class") + def backend(self): + # Always create correct test implementations, overriding any watermarked ones + import subprocess - subprocess.run( - [sys.executable, "-m", "BackendBench.scripts.create_simple_test_ops"], check=True - ) + subprocess.run( + [sys.executable, "-m", "BackendBench.scripts.create_simple_test_ops"], check=True + ) - return DirectoryBackend(ops_dir="generated_kernels") + return DirectoryBackend(ops_dir="generated_kernels") + def test_relu_operation(self, backend): + relu_op = torch.ops.aten.relu.default + assert relu_op in backend -def test_relu_operation(backend): - relu_op = torch.ops.aten.relu.default - assert relu_op in backend + our_impl = backend[relu_op] + x = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) + result = our_impl(x) + expected = relu_op(x) - our_impl = backend[relu_op] - x = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) - result = our_impl(x) - expected = relu_op(x) + assert torch.allclose(result, expected) - assert torch.allclose(result, expected) + def test_add_operation(self, backend): + add_op = torch.ops.aten.add.Tensor + assert add_op in backend + our_impl = backend[add_op] + a = torch.tensor([1.0, 2.0, 3.0]) + b = torch.tensor([4.0, 5.0, 6.0]) + result = our_impl(a, b) + expected = add_op(a, b) + print(f"result: {result}, expected: {expected}") -def test_add_operation(backend): - add_op = torch.ops.aten.add.Tensor - assert add_op in backend + assert torch.allclose(result, expected) - our_impl = backend[add_op] - a = torch.tensor([1.0, 2.0, 3.0]) - b = torch.tensor([4.0, 5.0, 6.0]) - result = our_impl(a, b) - expected = add_op(a, b) + def test_mul_operation(self, backend): + mul_op = torch.ops.aten.mul.Tensor + assert mul_op in backend - assert torch.allclose(result, expected) + our_impl = backend[mul_op] + a = torch.tensor([1.0, 2.0, 3.0]) + b = torch.tensor([4.0, 5.0, 6.0]) + result = our_impl(a, b) + expected = mul_op(a, b) + assert torch.allclose(result, expected) -def test_mul_operation(backend): - mul_op = torch.ops.aten.mul.Tensor - assert mul_op in backend + def test_abs_operation(self, backend): + abs_op = torch.ops.aten.abs.default + assert abs_op in backend - our_impl = backend[mul_op] - a = torch.tensor([1.0, 2.0, 3.0]) - b = torch.tensor([4.0, 5.0, 6.0]) - result = our_impl(a, b) - expected = mul_op(a, b) + our_impl = backend[abs_op] + x = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) + result = our_impl(x) + expected = abs_op(x) - assert torch.allclose(result, expected) + assert torch.allclose(result, expected) + def test_sum_operation(self, backend): + sum_op = torch.ops.aten.sum.default + assert sum_op in backend -def test_abs_operation(backend): - abs_op = torch.ops.aten.abs.default - assert abs_op in backend + our_impl = backend[sum_op] + x = torch.tensor([[1.0, 2.0], [3.0, 4.0]]) + result = our_impl(x) + expected = sum_op(x) - our_impl = backend[abs_op] - x = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) - result = our_impl(x) - expected = abs_op(x) + assert torch.allclose(result, expected) - assert torch.allclose(result, expected) + def test_backend_loading(self, backend): + loaded_ops = set(backend.compiled_kernels.keys()) + assert len(loaded_ops) > 0 + if os.path.exists("generated_kernels"): + dirs = [ + d + for d in os.listdir("generated_kernels") + if os.path.isdir(os.path.join("generated_kernels", d)) + ] + assert len(dirs) > 0 -def test_sum_operation(backend): - sum_op = torch.ops.aten.sum.default - assert sum_op in backend + def test_kernel_directories_exist(self, backend): + assert os.path.exists("generated_kernels") - our_impl = backend[sum_op] - x = torch.tensor([[1.0, 2.0], [3.0, 4.0]]) - result = our_impl(x) - expected = sum_op(x) + expected_dirs = ["relu", "add", "mul", "abs", "sum"] + for expected_dir in expected_dirs: + dir_path = os.path.join("generated_kernels", expected_dir) + assert os.path.isdir(dir_path) - assert torch.allclose(result, expected) + py_files = [f for f in os.listdir(dir_path) if f.endswith(".py")] + assert len(py_files) > 0 -def test_backend_loading(backend): - loaded_ops = set(backend.compiled_kernels.keys()) - assert len(loaded_ops) > 0 +class TestDirectoryBackendCUDA: + @pytest.fixture(scope="class") + def backend(self): + # Always create correct test implementations, overriding any watermarked ones + import subprocess - if os.path.exists("generated_kernels"): - dirs = [ - d - for d in os.listdir("generated_kernels") - if os.path.isdir(os.path.join("generated_kernels", d)) - ] - assert len(dirs) > 0 + subprocess.run( + [sys.executable, "-m", "BackendBench.scripts.create_simple_test_ops"], check=True + ) + return DirectoryBackend(ops_dir="generated_kernels") -def test_kernel_directories_exist(backend): - assert os.path.exists("generated_kernels") + def test_relu_operation(self, backend): + relu_op = torch.ops.aten.relu.default + assert relu_op in backend - expected_dirs = ["relu", "add", "mul", "abs", "sum"] - for expected_dir in expected_dirs: - dir_path = os.path.join("generated_kernels", expected_dir) - assert os.path.isdir(dir_path) + our_impl = backend[relu_op] + x = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) + result = our_impl(x) + expected = relu_op(x) - py_files = [f for f in os.listdir(dir_path) if f.endswith(".py")] - assert len(py_files) > 0 + assert torch.allclose(result, expected) + + def test_add_operation(self, backend): + add_op = torch.ops.aten.add.Tensor + assert add_op in backend + + our_impl = backend[add_op] + a = torch.tensor([1.0, 2.0, 3.0]) + b = torch.tensor([4.0, 5.0, 6.0]) + result = our_impl(a, b) + expected = add_op(a, b) + print(f"result: {result}, expected: {expected}") + + assert torch.allclose(result, expected) + + def test_mul_operation(self, backend): + mul_op = torch.ops.aten.mul.Tensor + assert mul_op in backend + + our_impl = backend[mul_op] + a = torch.tensor([1.0, 2.0, 3.0]) + b = torch.tensor([4.0, 5.0, 6.0]) + result = our_impl(a, b) + expected = mul_op(a, b) + + assert torch.allclose(result, expected) + + def test_abs_operation(self, backend): + abs_op = torch.ops.aten.abs.default + assert abs_op in backend + + our_impl = backend[abs_op] + x = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) + result = our_impl(x) + expected = abs_op(x) + + assert torch.allclose(result, expected) + + def test_sum_operation(self, backend): + sum_op = torch.ops.aten.sum.default + assert sum_op in backend + + our_impl = backend[sum_op] + x = torch.tensor([[1.0, 2.0], [3.0, 4.0]]) + result = our_impl(x) + expected = sum_op(x) + + assert torch.allclose(result, expected) + + def test_backend_loading(self, backend): + loaded_ops = set(backend.compiled_kernels.keys()) + assert len(loaded_ops) > 0 + + if os.path.exists("generated_kernels"): + dirs = [ + d + for d in os.listdir("generated_kernels") + if os.path.isdir(os.path.join("generated_kernels", d)) + ] + assert len(dirs) > 0 + + def test_kernel_directories_exist(self, backend): + assert os.path.exists("generated_kernels") + + expected_dirs = ["relu", "add", "mul", "abs", "sum"] + for expected_dir in expected_dirs: + dir_path = os.path.join("generated_kernels", expected_dir) + assert os.path.isdir(dir_path) + + py_files = [f for f in os.listdir(dir_path) if f.endswith(".py")] + assert len(py_files) > 0 From 93e5d4cb2d7918dd6fb3c0777c15448dcd0c785d Mon Sep 17 00:00:00 2001 From: Jiannan Wang Date: Tue, 28 Oct 2025 15:57:44 -0700 Subject: [PATCH 02/19] update --- ..._ops.py => create_simple_test_ops_cuda.py} | 23 ++++--- test/test_directory_backend.py | 61 ++++--------------- 2 files changed, 28 insertions(+), 56 deletions(-) rename BackendBench/scripts/{create_simple_test_cuda_ops.py => create_simple_test_ops_cuda.py} (67%) diff --git a/BackendBench/scripts/create_simple_test_cuda_ops.py b/BackendBench/scripts/create_simple_test_ops_cuda.py similarity index 67% rename from BackendBench/scripts/create_simple_test_cuda_ops.py rename to BackendBench/scripts/create_simple_test_ops_cuda.py index 7afa28bd..e526929e 100644 --- a/BackendBench/scripts/create_simple_test_cuda_ops.py +++ b/BackendBench/scripts/create_simple_test_ops_cuda.py @@ -11,15 +11,16 @@ Each just calls the original PyTorch function. """ +import argparse import logging import os logger = logging.getLogger(__name__) -def create_add(): - os.makedirs("generated_kernels_cuda/add", exist_ok=True) - with open("generated_kernels_cuda/add/add_implementation_v1.cu", "w") as f: +def create_add(base_dir): + os.makedirs(f"{base_dir}/add", exist_ok=True) + with open(f"{base_dir}/add/add_implementation_v1.cu", "w") as f: f.write(""" __global__ void add_kernel( const float* __restrict__ x, @@ -40,17 +41,23 @@ def create_add(): return output; } """) - with open("generated_kernels_cuda/add/add_implementation_v1.cpp", "w") as f: + with open(f"{base_dir}/add/add_implementation_v1.cpp", "w") as f: f.write("""torch::Tensor add(torch::Tensor x, torch::Tensor y);""") logger.info("Created add implementation") def main(): - """Create 5 simple test operations.""" - logging.basicConfig(level=logging.INFO, format="%(message)s") - logger.info("Creating cuda kernel implementations for testing...") + """Create 1 simple test operations.""" + parser = argparse.ArgumentParser(description="Creating cuda kernel implementations for testing") + parser.add_argument( + "--base-dir", + default="generated_kernels", + help="Base directory containing operator subdirectories", + ) - create_add() + args = parser.parse_args() + + create_add(args.base_dir) if __name__ == "__main__": diff --git a/test/test_directory_backend.py b/test/test_directory_backend.py index f132b28e..46839967 100644 --- a/test/test_directory_backend.py +++ b/test/test_directory_backend.py @@ -113,76 +113,41 @@ def test_kernel_directories_exist(self, backend): assert len(py_files) > 0 +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") class TestDirectoryBackendCUDA: + base_dir = "generated_kernels_cuda" + @pytest.fixture(scope="class") def backend(self): # Always create correct test implementations, overriding any watermarked ones import subprocess subprocess.run( - [sys.executable, "-m", "BackendBench.scripts.create_simple_test_ops"], check=True + [ + sys.executable, + "-m", + "BackendBench.scripts.create_simple_test_ops_cuda", + "--base-dir", + self.base_dir, + ], + check=True, ) return DirectoryBackend(ops_dir="generated_kernels") - def test_relu_operation(self, backend): - relu_op = torch.ops.aten.relu.default - assert relu_op in backend - - our_impl = backend[relu_op] - x = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) - result = our_impl(x) - expected = relu_op(x) - - assert torch.allclose(result, expected) - def test_add_operation(self, backend): add_op = torch.ops.aten.add.Tensor assert add_op in backend our_impl = backend[add_op] - a = torch.tensor([1.0, 2.0, 3.0]) - b = torch.tensor([4.0, 5.0, 6.0]) + a = torch.tensor([1.0, 2.0, 3.0]).cuda() + b = torch.tensor([4.0, 5.0, 6.0]).cuda() result = our_impl(a, b) expected = add_op(a, b) print(f"result: {result}, expected: {expected}") assert torch.allclose(result, expected) - def test_mul_operation(self, backend): - mul_op = torch.ops.aten.mul.Tensor - assert mul_op in backend - - our_impl = backend[mul_op] - a = torch.tensor([1.0, 2.0, 3.0]) - b = torch.tensor([4.0, 5.0, 6.0]) - result = our_impl(a, b) - expected = mul_op(a, b) - - assert torch.allclose(result, expected) - - def test_abs_operation(self, backend): - abs_op = torch.ops.aten.abs.default - assert abs_op in backend - - our_impl = backend[abs_op] - x = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) - result = our_impl(x) - expected = abs_op(x) - - assert torch.allclose(result, expected) - - def test_sum_operation(self, backend): - sum_op = torch.ops.aten.sum.default - assert sum_op in backend - - our_impl = backend[sum_op] - x = torch.tensor([[1.0, 2.0], [3.0, 4.0]]) - result = our_impl(x) - expected = sum_op(x) - - assert torch.allclose(result, expected) - def test_backend_loading(self, backend): loaded_ops = set(backend.compiled_kernels.keys()) assert len(loaded_ops) > 0 From d7f80743b44546e30667e4cd2b0d1c576bf3f479 Mon Sep 17 00:00:00 2001 From: Jiannan Wang Date: Wed, 29 Oct 2025 21:56:23 -0700 Subject: [PATCH 03/19] fix --- BackendBench/backends/directory.py | 54 ++++++++---------------------- 1 file changed, 14 insertions(+), 40 deletions(-) diff --git a/BackendBench/backends/directory.py b/BackendBench/backends/directory.py index 36eb165b..38a778c2 100644 --- a/BackendBench/backends/directory.py +++ b/BackendBench/backends/directory.py @@ -11,7 +11,7 @@ from torch.utils.cpp_extension import load_inline -from ..utils import folder_name_to_op_name, get_pytorch_op, op_name_to_folder_name +from ..utils import folder_name_to_op_name, get_pytorch_op from .base import Backend logger = logging.getLogger(__name__) @@ -61,7 +61,7 @@ def _load_kernels(self): try: op_name = folder_name_to_op_name(folder_name) - kernel_func = self._load_kernel_from_file(impl_path, op_name) + kernel_func = self._load_kernel_from_file(impl_path, folder_name) pytorch_op = get_pytorch_op(op_name) if pytorch_op: @@ -74,13 +74,13 @@ def _load_kernels(self): logger.info(f"DirectoryBackend loaded {loaded_count} kernels from {self.ops_dir}/") - def _load_python_kernel(self, file_path: str, op_name: str) -> Callable: + def _load_python_kernel(self, file_path: str, folder_name: str) -> Callable: """ Load a kernel implementation from a Python file. Args: file_path: Path to the Python implementation file - op_name: Base name of the operator (e.g., "add", "mul", "conv2d") + folder_name: Base name of the operator (e.g., "add__Tensor") Returns: Callable kernel implementation function @@ -88,7 +88,6 @@ def _load_python_kernel(self, file_path: str, op_name: str) -> Callable: Raises: ValueError: If the expected kernel function is not found in the file """ - folder_name = op_name_to_folder_name(op_name) spec = importlib.util.spec_from_file_location(f"op_{folder_name}", file_path) module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) @@ -99,38 +98,13 @@ def _load_python_kernel(self, file_path: str, op_name: str) -> Callable: else: raise ValueError(f"No function named {kernel_func_name} found in {file_path}") - def setup_cpp_extension(self): - global CPP_EXTENSION_IS_SETUP - if not CPP_EXTENSION_IS_SETUP: - from setuptools import setup - from torch.utils import cpp_extension - - setup( - name="extension_cpp", - ext_modules=[ - cpp_extension.CppExtension( - "extension_cpp", - ["muladd.cpp"], - # define Py_LIMITED_API with min version 3.9 to expose only the stable - # limited API subset from Python.h - extra_compile_args={"cxx": ["-DPy_LIMITED_API=0x03090000"]}, - py_limited_api=True, - ) - ], # Build 1 wheel across multiple Python versions - cmdclass={"build_ext": cpp_extension.BuildExtension}, - options={ - "bdist_wheel": {"py_limited_api": "cp39"} - }, # 3.9 is minimum supported Python version - ) - CPP_EXTENSION_IS_SETUP = True - - def _load_cuda_kernel(self, file_path: str, op_name: str) -> Callable: + def _load_cuda_kernel(self, file_path: str, folder_name: str) -> Callable: """ Load and compile a kernel implementation from CUDA files using load_inline. Args: file_path: Path to the CUDA implementation file (.cu or .cpp) - op_name: Base name of the operator (e.g., "add__Tensor") + folder_name: Base name of the operator (e.g., "add__Tensor") Returns: Callable kernel implementation function @@ -160,12 +134,12 @@ def _load_cuda_kernel(self, file_path: str, op_name: str) -> Callable: print(f"cpp_source: {cpp_source}") # Use load_inline for all cases - module_name = f"{op_name}_cuda_inline" + module_name = f"{folder_name}_cuda_inline" cuda_module = load_inline( name=module_name, cpp_sources=cpp_source, cuda_sources=cuda_source, - functions=[op_name], + functions=[folder_name], verbose=True, ) @@ -174,14 +148,14 @@ def _load_cuda_kernel(self, file_path: str, op_name: str) -> Callable: # print(getattr(cuda_module, op_name)(x, y)) - if hasattr(cuda_module, op_name): - return getattr(cuda_module, op_name) + if hasattr(cuda_module, folder_name): + return getattr(cuda_module, folder_name) else: raise ValueError( - f"No function named {op_name} found in compiled CUDA module from {file_path}" + f"No function named {folder_name} found in compiled CUDA module from {file_path}" ) - def ß(self, file_path: str, op_name: str) -> Callable: + def _load_kernel_from_file(self, file_path: str, folder_name: str) -> Callable: """ Dynamically load a kernel implementation function from a Python or CUDA file. @@ -202,9 +176,9 @@ def ß(self, file_path: str, op_name: str) -> Callable: file_ext = os.path.splitext(file_path)[1] if file_ext == ".py": - return self._load_python_kernel(file_path, op_name) + return self._load_python_kernel(file_path, folder_name) elif file_ext in [".cu", ".cpp"]: - return self._load_cuda_kernel(file_path, op_name) + return self._load_cuda_kernel(file_path, folder_name) else: raise ValueError( f"Unsupported file extension {file_ext} for {file_path}. Expected .py, .cu, or .cpp" From d1c864963027f65526b9c7a1da228d1564a6271a Mon Sep 17 00:00:00 2001 From: Jiannan Wang Date: Wed, 29 Oct 2025 22:39:53 -0700 Subject: [PATCH 04/19] fix --- BackendBench/backends/directory.py | 7 - .../scripts/create_simple_test_ops_cuda.py | 14 +- test/test_directory_backend.py | 122 ++++++++++-------- 3 files changed, 73 insertions(+), 70 deletions(-) diff --git a/BackendBench/backends/directory.py b/BackendBench/backends/directory.py index 38a778c2..aea953c6 100644 --- a/BackendBench/backends/directory.py +++ b/BackendBench/backends/directory.py @@ -126,12 +126,10 @@ def _load_cuda_kernel(self, file_path: str, folder_name: str) -> Callable: if os.path.exists(cu_file): with open(cu_file, "r") as f: cuda_source = f.read() - print(f"cuda_source: {cuda_source}") if os.path.exists(cpp_file): with open(cpp_file, "r") as f: cpp_source = f.read() - print(f"cpp_source: {cpp_source}") # Use load_inline for all cases module_name = f"{folder_name}_cuda_inline" @@ -143,11 +141,6 @@ def _load_cuda_kernel(self, file_path: str, folder_name: str) -> Callable: verbose=True, ) - # x = torch.randn(4, 4, device="cuda", dtype=torch.float32) - # y = torch.randn(4, 4, device="cuda", dtype=torch.float32) - - # print(getattr(cuda_module, op_name)(x, y)) - if hasattr(cuda_module, folder_name): return getattr(cuda_module, folder_name) else: diff --git a/BackendBench/scripts/create_simple_test_ops_cuda.py b/BackendBench/scripts/create_simple_test_ops_cuda.py index e526929e..e94d8bd2 100644 --- a/BackendBench/scripts/create_simple_test_ops_cuda.py +++ b/BackendBench/scripts/create_simple_test_ops_cuda.py @@ -19,10 +19,10 @@ def create_add(base_dir): - os.makedirs(f"{base_dir}/add", exist_ok=True) - with open(f"{base_dir}/add/add_implementation_v1.cu", "w") as f: + os.makedirs(f"{base_dir}/add__Tensor", exist_ok=True) + with open(f"{base_dir}/add__Tensor/add__Tensor_implementation_v1.cu", "w") as f: f.write(""" -__global__ void add_kernel( +__global__ void add__Tensor_kernel( const float* __restrict__ x, const float* __restrict__ y, float* __restrict__ output, @@ -33,16 +33,16 @@ def create_add(base_dir): } } -torch::Tensor add(torch::Tensor x, torch::Tensor y) { +torch::Tensor add__Tensor(torch::Tensor x, torch::Tensor y) { auto output = torch::zeros_like(x); const int threads = 1024; const int blocks = (output.numel() + threads - 1) / threads; - add_kernel<<>>(x.data(), y.data(), output.data(), output.numel()); + add__Tensor_kernel<<>>(x.data(), y.data(), output.data(), output.numel()); return output; } """) - with open(f"{base_dir}/add/add_implementation_v1.cpp", "w") as f: - f.write("""torch::Tensor add(torch::Tensor x, torch::Tensor y);""") + with open(f"{base_dir}/add__Tensor/add__Tensor_implementation_v1.cpp", "w") as f: + f.write("""torch::Tensor add__Tensor(torch::Tensor x, torch::Tensor y);""") logger.info("Created add implementation") diff --git a/test/test_directory_backend.py b/test/test_directory_backend.py index 46839967..603dc2bc 100644 --- a/test/test_directory_backend.py +++ b/test/test_directory_backend.py @@ -17,20 +17,25 @@ import torch from BackendBench.backends import DirectoryBackend +from BackendBench.utils import op_name_to_folder_name -class TestDirectoryBackend: - @pytest.fixture(scope="class") - def backend(self): - # Always create correct test implementations, overriding any watermarked ones - import subprocess +@pytest.fixture(scope="class") +def backend(request): + # Always create correct test implementations, overriding any watermarked ones + import subprocess + + subprocess.run( + [sys.executable, "-m", "BackendBench.scripts.create_simple_test_ops"], check=True + ) + yield DirectoryBackend(ops_dir="generated_kernels") - subprocess.run( - [sys.executable, "-m", "BackendBench.scripts.create_simple_test_ops"], check=True - ) + import shutil - return DirectoryBackend(ops_dir="generated_kernels") + shutil.rmtree("generated_kernels", ignore_errors=True) + +class TestDirectoryBackend: def test_relu_operation(self, backend): relu_op = torch.ops.aten.relu.default assert relu_op in backend @@ -51,7 +56,6 @@ def test_add_operation(self, backend): b = torch.tensor([4.0, 5.0, 6.0]) result = our_impl(a, b) expected = add_op(a, b) - print(f"result: {result}, expected: {expected}") assert torch.allclose(result, expected) @@ -93,19 +97,20 @@ def test_backend_loading(self, backend): loaded_ops = set(backend.compiled_kernels.keys()) assert len(loaded_ops) > 0 - if os.path.exists("generated_kernels"): - dirs = [ - d - for d in os.listdir("generated_kernels") - if os.path.isdir(os.path.join("generated_kernels", d)) - ] - assert len(dirs) > 0 + assert os.path.exists("generated_kernels") + dirs = [ + d + for d in os.listdir("generated_kernels") + if os.path.isdir(os.path.join("generated_kernels", d)) + ] + assert len(dirs) > 0 def test_kernel_directories_exist(self, backend): assert os.path.exists("generated_kernels") - expected_dirs = ["relu", "add", "mul", "abs", "sum"] - for expected_dir in expected_dirs: + expected_ops = ["relu.default", "add.Tensor", "mul.Tensor", "abs.default", "sum.default"] + for expected_op in expected_ops: + expected_dir = op_name_to_folder_name(expected_op) dir_path = os.path.join("generated_kernels", expected_dir) assert os.path.isdir(dir_path) @@ -113,60 +118,65 @@ def test_kernel_directories_exist(self, backend): assert len(py_files) > 0 -@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") -class TestDirectoryBackendCUDA: - base_dir = "generated_kernels_cuda" +@pytest.fixture(scope="class") +def backend_cuda(request): + import subprocess - @pytest.fixture(scope="class") - def backend(self): - # Always create correct test implementations, overriding any watermarked ones - import subprocess + # Access class attribute via request.cls + base_dir = getattr(request.cls, "base_dir", "generated_kernels_cuda") + subprocess.run( + [ + sys.executable, + "-m", + "BackendBench.scripts.create_simple_test_ops_cuda", + "--base-dir", + base_dir, + ], + check=True, + ) + backend_instance = DirectoryBackend(ops_dir=base_dir) + yield backend_instance + # Optional: Teardown logic here (e.g., remove base_dir directory) + import shutil - subprocess.run( - [ - sys.executable, - "-m", - "BackendBench.scripts.create_simple_test_ops_cuda", - "--base-dir", - self.base_dir, - ], - check=True, - ) + shutil.rmtree(base_dir, ignore_errors=True) - return DirectoryBackend(ops_dir="generated_kernels") - def test_add_operation(self, backend): +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") +class TestDirectoryBackendCUDA: + base_dir = "generated_kernels_cuda" + + def test_add_operation(self, backend_cuda): add_op = torch.ops.aten.add.Tensor - assert add_op in backend + assert add_op in backend_cuda - our_impl = backend[add_op] + our_impl = backend_cuda[add_op] a = torch.tensor([1.0, 2.0, 3.0]).cuda() b = torch.tensor([4.0, 5.0, 6.0]).cuda() result = our_impl(a, b) expected = add_op(a, b) - print(f"result: {result}, expected: {expected}") assert torch.allclose(result, expected) - def test_backend_loading(self, backend): - loaded_ops = set(backend.compiled_kernels.keys()) + def test_backend_loading(self, backend_cuda): + loaded_ops = set(backend_cuda.compiled_kernels.keys()) assert len(loaded_ops) > 0 + os.path.exists(self.base_dir) - if os.path.exists("generated_kernels"): - dirs = [ - d - for d in os.listdir("generated_kernels") - if os.path.isdir(os.path.join("generated_kernels", d)) - ] - assert len(dirs) > 0 + dirs = [ + d for d in os.listdir(self.base_dir) if os.path.isdir(os.path.join(self.base_dir, d)) + ] + assert len(dirs) > 0 - def test_kernel_directories_exist(self, backend): - assert os.path.exists("generated_kernels") + def test_kernel_directories_exist(self, backend_cuda): + assert os.path.exists(self.base_dir) - expected_dirs = ["relu", "add", "mul", "abs", "sum"] + expected_dirs = ["add__Tensor"] for expected_dir in expected_dirs: - dir_path = os.path.join("generated_kernels", expected_dir) + dir_path = os.path.join(self.base_dir, expected_dir) assert os.path.isdir(dir_path) - py_files = [f for f in os.listdir(dir_path) if f.endswith(".py")] - assert len(py_files) > 0 + cuda_files = [ + f for f in os.listdir(dir_path) if f.endswith(".cu") or f.endswith(".cpp") + ] + assert len(cuda_files) > 0 From 334a39eaa0767fad1092804a530f2b6e030a2ace Mon Sep 17 00:00:00 2001 From: Jiannan Wang Date: Wed, 29 Oct 2025 22:47:15 -0700 Subject: [PATCH 05/19] add ninja to ci --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index ae6bafcc..a335d742 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ dependencies = [ "pandas", "datasets", "tenacity", + "ninja", ] [project.optional-dependencies] From 08b7054b651e0e78a4af9596b6dd53c9bd7460c3 Mon Sep 17 00:00:00 2001 From: Jiannan Wang Date: Wed, 29 Oct 2025 23:00:32 -0700 Subject: [PATCH 06/19] set CUDA_HOME --- .github/workflows/smoke-test.yml | 35 +++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml index b81d1b21..a49942d9 100644 --- a/.github/workflows/smoke-test.yml +++ b/.github/workflows/smoke-test.yml @@ -23,7 +23,40 @@ jobs: - name: Install package and dependencies run: uv sync --dev - + + - name: Set CUDA_HOME + run: | + # Find CUDA installation + if [ -d "/usr/local/cuda" ]; then + echo "CUDA_HOME=/usr/local/cuda" >> $GITHUB_ENV + echo "Found CUDA at /usr/local/cuda" + elif [ -d "/usr/lib/cuda" ]; then + echo "CUDA_HOME=/usr/lib/cuda" >> $GITHUB_ENV + echo "Found CUDA at /usr/lib/cuda" + else + # Try to find CUDA using which nvcc + NVCC_PATH=$(which nvcc 2>/dev/null || echo "") + if [ -n "$NVCC_PATH" ]; then + CUDA_HOME=$(dirname $(dirname $NVCC_PATH)) + echo "CUDA_HOME=$CUDA_HOME" >> $GITHUB_ENV + echo "Found CUDA at $CUDA_HOME" + else + echo "Warning: CUDA installation not found, tests may fail" + fi + fi + + - name: Verify CUDA setup + run: | + echo "CUDA_HOME: $CUDA_HOME" + if [ -n "$CUDA_HOME" ]; then + ls -la "$CUDA_HOME" || echo "CUDA_HOME directory does not exist" + if [ -f "$CUDA_HOME/bin/nvcc" ]; then + "$CUDA_HOME/bin/nvcc" --version + else + echo "nvcc not found at $CUDA_HOME/bin/nvcc" + fi + fi + - name: Clone FACTO source run: git clone https://github.com/pytorch-labs/FACTO.git From 44a427b33bc19d6c58eb48d41efc28911c3a2d8e Mon Sep 17 00:00:00 2001 From: Jiannan Wang Date: Wed, 29 Oct 2025 23:11:58 -0700 Subject: [PATCH 07/19] add skip --- test/test_directory_backend.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_directory_backend.py b/test/test_directory_backend.py index 603dc2bc..c61440fb 100644 --- a/test/test_directory_backend.py +++ b/test/test_directory_backend.py @@ -143,6 +143,7 @@ def backend_cuda(request): @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") +@pytest.mark.skipif("CUDA_HOME" not in os.environ, reason="CUDA_HOME is not available") class TestDirectoryBackendCUDA: base_dir = "generated_kernels_cuda" From 1cf7b4d1d99006283c23d12d665e4dd915d01073 Mon Sep 17 00:00:00 2001 From: Jiannan Wang Date: Wed, 29 Oct 2025 23:12:18 -0700 Subject: [PATCH 08/19] fix --- .github/workflows/smoke-test.yml | 35 +------------------------------- 1 file changed, 1 insertion(+), 34 deletions(-) diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml index a49942d9..b81d1b21 100644 --- a/.github/workflows/smoke-test.yml +++ b/.github/workflows/smoke-test.yml @@ -23,40 +23,7 @@ jobs: - name: Install package and dependencies run: uv sync --dev - - - name: Set CUDA_HOME - run: | - # Find CUDA installation - if [ -d "/usr/local/cuda" ]; then - echo "CUDA_HOME=/usr/local/cuda" >> $GITHUB_ENV - echo "Found CUDA at /usr/local/cuda" - elif [ -d "/usr/lib/cuda" ]; then - echo "CUDA_HOME=/usr/lib/cuda" >> $GITHUB_ENV - echo "Found CUDA at /usr/lib/cuda" - else - # Try to find CUDA using which nvcc - NVCC_PATH=$(which nvcc 2>/dev/null || echo "") - if [ -n "$NVCC_PATH" ]; then - CUDA_HOME=$(dirname $(dirname $NVCC_PATH)) - echo "CUDA_HOME=$CUDA_HOME" >> $GITHUB_ENV - echo "Found CUDA at $CUDA_HOME" - else - echo "Warning: CUDA installation not found, tests may fail" - fi - fi - - - name: Verify CUDA setup - run: | - echo "CUDA_HOME: $CUDA_HOME" - if [ -n "$CUDA_HOME" ]; then - ls -la "$CUDA_HOME" || echo "CUDA_HOME directory does not exist" - if [ -f "$CUDA_HOME/bin/nvcc" ]; then - "$CUDA_HOME/bin/nvcc" --version - else - echo "nvcc not found at $CUDA_HOME/bin/nvcc" - fi - fi - + - name: Clone FACTO source run: git clone https://github.com/pytorch-labs/FACTO.git From 401473e3a9f5adb391044da7223c2496b9061a6f Mon Sep 17 00:00:00 2001 From: Jiannan Wang Date: Thu, 30 Oct 2025 11:09:50 -0700 Subject: [PATCH 09/19] add no_implicit_headers --- BackendBench/backends/directory.py | 2 +- .../scripts/create_simple_test_ops_cuda.py | 23 ++++++++++++------- test/test_directory_backend.py | 5 ++-- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/BackendBench/backends/directory.py b/BackendBench/backends/directory.py index aea953c6..4cb96528 100644 --- a/BackendBench/backends/directory.py +++ b/BackendBench/backends/directory.py @@ -138,7 +138,7 @@ def _load_cuda_kernel(self, file_path: str, folder_name: str) -> Callable: cpp_sources=cpp_source, cuda_sources=cuda_source, functions=[folder_name], - verbose=True, + no_implicit_headers=True, ) if hasattr(cuda_module, folder_name): diff --git a/BackendBench/scripts/create_simple_test_ops_cuda.py b/BackendBench/scripts/create_simple_test_ops_cuda.py index e94d8bd2..b585a0d7 100644 --- a/BackendBench/scripts/create_simple_test_ops_cuda.py +++ b/BackendBench/scripts/create_simple_test_ops_cuda.py @@ -21,7 +21,9 @@ def create_add(base_dir): os.makedirs(f"{base_dir}/add__Tensor", exist_ok=True) with open(f"{base_dir}/add__Tensor/add__Tensor_implementation_v1.cu", "w") as f: - f.write(""" + f.write("""#include +#include + __global__ void add__Tensor_kernel( const float* __restrict__ x, const float* __restrict__ y, @@ -33,16 +35,21 @@ def create_add(base_dir): } } -torch::Tensor add__Tensor(torch::Tensor x, torch::Tensor y) { - auto output = torch::zeros_like(x); - const int threads = 1024; - const int blocks = (output.numel() + threads - 1) / threads; - add__Tensor_kernel<<>>(x.data(), y.data(), output.data(), output.numel()); - return output; +at::Tensor add__Tensor(const at::Tensor& a, const at::Tensor& b) { + auto out = at::empty_like(a); + int64_t numel = a.numel(); + const int threads = 256; + const int blocks = (numel + threads - 1) / threads; + add__Tensor_kernel<<>>( + a.data_ptr(), b.data_ptr(), out.data_ptr(), numel + ); + return out; } """) with open(f"{base_dir}/add__Tensor/add__Tensor_implementation_v1.cpp", "w") as f: - f.write("""torch::Tensor add__Tensor(torch::Tensor x, torch::Tensor y);""") + f.write("""#include + +at::Tensor add__Tensor(const at::Tensor& a, const at::Tensor& b);""") logger.info("Created add implementation") diff --git a/test/test_directory_backend.py b/test/test_directory_backend.py index c61440fb..247fff0f 100644 --- a/test/test_directory_backend.py +++ b/test/test_directory_backend.py @@ -135,15 +135,16 @@ def backend_cuda(request): check=True, ) backend_instance = DirectoryBackend(ops_dir=base_dir) + yield backend_instance - # Optional: Teardown logic here (e.g., remove base_dir directory) + import shutil shutil.rmtree(base_dir, ignore_errors=True) @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") -@pytest.mark.skipif("CUDA_HOME" not in os.environ, reason="CUDA_HOME is not available") +# @pytest.mark.skipif("CUDA_HOME" not in os.environ, reason="CUDA_HOME is not available") class TestDirectoryBackendCUDA: base_dir = "generated_kernels_cuda" From 39eb648ddb1232042ea5c17b224b0befe70de901 Mon Sep 17 00:00:00 2001 From: Jiannan Wang Date: Thu, 30 Oct 2025 11:19:41 -0700 Subject: [PATCH 10/19] test --- .github/workflows/smoke-test.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml index b81d1b21..6a8b40e1 100644 --- a/.github/workflows/smoke-test.yml +++ b/.github/workflows/smoke-test.yml @@ -33,6 +33,9 @@ jobs: - name: Run smoke test run: uv run python -m BackendBench.scripts.main --suite smoke --backend aten + - name: Find cuda version + run: nvcc --version | grep -oP 'release \K[0-9]+.[0- && ls /usr/local | grep cuda + - name: Run FACTO test run: uv run python -m BackendBench.scripts.main --suite facto --backend aten --ops "add.Tensor" From e417006764bd9635c0ffc5d6bab094ce2832c105 Mon Sep 17 00:00:00 2001 From: Jiannan Wang Date: Thu, 30 Oct 2025 12:01:23 -0700 Subject: [PATCH 11/19] test cuda version --- .github/workflows/smoke-test.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml index 6a8b40e1..39445833 100644 --- a/.github/workflows/smoke-test.yml +++ b/.github/workflows/smoke-test.yml @@ -33,8 +33,10 @@ jobs: - name: Run smoke test run: uv run python -m BackendBench.scripts.main --suite smoke --backend aten - - name: Find cuda version - run: nvcc --version | grep -oP 'release \K[0-9]+.[0- && ls /usr/local | grep cuda + - name: Find CUDA version + run: | + nvcc --version | grep -oP 'release \K[0-9]+\.[0-9]+' + ls /usr/local | grep cuda - name: Run FACTO test run: uv run python -m BackendBench.scripts.main --suite facto --backend aten --ops "add.Tensor" From e65f8e9ef69278c3422486e8cc4607c885f20633 Mon Sep 17 00:00:00 2001 From: Jiannan Wang Date: Thu, 30 Oct 2025 12:51:25 -0700 Subject: [PATCH 12/19] update --- .github/workflows/smoke-test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml index 39445833..6a790651 100644 --- a/.github/workflows/smoke-test.yml +++ b/.github/workflows/smoke-test.yml @@ -35,7 +35,6 @@ jobs: - name: Find CUDA version run: | - nvcc --version | grep -oP 'release \K[0-9]+\.[0-9]+' ls /usr/local | grep cuda - name: Run FACTO test From 945353d8841fda6631d064584b796e29acfe5534 Mon Sep 17 00:00:00 2001 From: Jiannan Wang Date: Thu, 30 Oct 2025 12:58:42 -0700 Subject: [PATCH 13/19] install cuda toolkit in ci --- .github/workflows/smoke-test.yml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml index 6a790651..bcb3aa12 100644 --- a/.github/workflows/smoke-test.yml +++ b/.github/workflows/smoke-test.yml @@ -14,6 +14,11 @@ jobs: steps: - uses: actions/checkout@v4 + + - name: Install CUDA Toolkit + uses: Jimver/cuda-toolkit@v0.2.18 + with: + cuda: '12.4.0' - name: Install uv uses: astral-sh/setup-uv@v3 @@ -21,6 +26,10 @@ jobs: - name: Set up Python run: uv python install 3.13 + - name: Find CUDA version + run: | + ls /usr/local | grep cuda + - name: Install package and dependencies run: uv sync --dev @@ -33,9 +42,6 @@ jobs: - name: Run smoke test run: uv run python -m BackendBench.scripts.main --suite smoke --backend aten - - name: Find CUDA version - run: | - ls /usr/local | grep cuda - name: Run FACTO test run: uv run python -m BackendBench.scripts.main --suite facto --backend aten --ops "add.Tensor" From c35add9d971bfb8ad7ce1444e47fbbc05a818bf6 Mon Sep 17 00:00:00 2001 From: Jiannan Wang Date: Thu, 30 Oct 2025 12:58:56 -0700 Subject: [PATCH 14/19] install cuda toolkit in ci --- .github/workflows/smoke-test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml index bcb3aa12..4e6ae315 100644 --- a/.github/workflows/smoke-test.yml +++ b/.github/workflows/smoke-test.yml @@ -26,9 +26,9 @@ jobs: - name: Set up Python run: uv python install 3.13 - - name: Find CUDA version - run: | - ls /usr/local | grep cuda + # - name: Find CUDA version + # run: | + # ls /usr/local | grep cuda - name: Install package and dependencies run: uv sync --dev From e99eb6af8dc70c635619f1388c4ea2ddc1c65782 Mon Sep 17 00:00:00 2001 From: Jiannan Wang Date: Thu, 30 Oct 2025 13:19:56 -0700 Subject: [PATCH 15/19] install cuda toolkit in ci --- .github/workflows/smoke-test.yml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml index 4e6ae315..d836ccde 100644 --- a/.github/workflows/smoke-test.yml +++ b/.github/workflows/smoke-test.yml @@ -14,11 +14,6 @@ jobs: steps: - uses: actions/checkout@v4 - - - name: Install CUDA Toolkit - uses: Jimver/cuda-toolkit@v0.2.18 - with: - cuda: '12.4.0' - name: Install uv uses: astral-sh/setup-uv@v3 @@ -31,7 +26,13 @@ jobs: # ls /usr/local | grep cuda - name: Install package and dependencies - run: uv sync --dev + run: | + uv sync --dev + uv pip install "nvidia-cuda-runtime-cu12==12.4.127" \ + "nvidia-cublas-cu12==12.4.5.8" \ + "nvidia-cuda-nvrtc-cu12==12.4.127" \ + "nvidia-cuda-nvcc-cu12==12.4.131" + python -m pip list | grep nvidia - name: Clone FACTO source run: git clone https://github.com/pytorch-labs/FACTO.git From 05fe3b8bce396cd712beedb2575b748049ae835f Mon Sep 17 00:00:00 2001 From: Jiannan Wang Date: Thu, 30 Oct 2025 13:23:42 -0700 Subject: [PATCH 16/19] fix --- .github/workflows/smoke-test.yml | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml index d836ccde..fa03a6a8 100644 --- a/.github/workflows/smoke-test.yml +++ b/.github/workflows/smoke-test.yml @@ -21,18 +21,23 @@ jobs: - name: Set up Python run: uv python install 3.13 - # - name: Find CUDA version - # run: | - # ls /usr/local | grep cuda - - name: Install package and dependencies + run: uv sync --dev + + - name: Verify CUDA environment run: | - uv sync --dev - uv pip install "nvidia-cuda-runtime-cu12==12.4.127" \ - "nvidia-cublas-cu12==12.4.5.8" \ - "nvidia-cuda-nvrtc-cu12==12.4.127" \ - "nvidia-cuda-nvcc-cu12==12.4.131" - python -m pip list | grep nvidia + python - <<'PY' + import torch, os, subprocess + print("PyTorch version:", torch.__version__) + print("CUDA available:", torch.cuda.is_available()) + print("CUDA device count:", torch.cuda.device_count()) + print("CUDA_HOME:", torch.utils.cpp_extension.CUDA_HOME) + try: + out = subprocess.check_output(["python", "-m", "nvidia.cuda.nvcc", "--version"], text=True) + print("NVCC version:\n", out) + except Exception as e: + print("NVCC not found:", e) + PY - name: Clone FACTO source run: git clone https://github.com/pytorch-labs/FACTO.git From 734c93321374c5ae0b6d73ab3ebf073eedcd8361 Mon Sep 17 00:00:00 2001 From: Jiannan Wang Date: Thu, 30 Oct 2025 13:25:18 -0700 Subject: [PATCH 17/19] fix --- .github/workflows/smoke-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml index fa03a6a8..6e8e75e6 100644 --- a/.github/workflows/smoke-test.yml +++ b/.github/workflows/smoke-test.yml @@ -26,7 +26,7 @@ jobs: - name: Verify CUDA environment run: | - python - <<'PY' + uv run python - <<'PY' import torch, os, subprocess print("PyTorch version:", torch.__version__) print("CUDA available:", torch.cuda.is_available()) From 42fbe3606dbc2cadda8c7bdc998530bd163ee084 Mon Sep 17 00:00:00 2001 From: Jiannan Wang Date: Thu, 30 Oct 2025 13:28:54 -0700 Subject: [PATCH 18/19] fix --- .github/workflows/smoke-test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml index 6e8e75e6..ee4af4d8 100644 --- a/.github/workflows/smoke-test.yml +++ b/.github/workflows/smoke-test.yml @@ -31,7 +31,8 @@ jobs: print("PyTorch version:", torch.__version__) print("CUDA available:", torch.cuda.is_available()) print("CUDA device count:", torch.cuda.device_count()) - print("CUDA_HOME:", torch.utils.cpp_extension.CUDA_HOME) + from torch.utils.cpp_extension import CUDA_HOME + print("CUDA_HOME:", CUDA_HOME) try: out = subprocess.check_output(["python", "-m", "nvidia.cuda.nvcc", "--version"], text=True) print("NVCC version:\n", out) From f75fbaf79535b40a847cf8eb712677ae5c91887f Mon Sep 17 00:00:00 2001 From: Jiannan Wang Date: Thu, 30 Oct 2025 13:34:56 -0700 Subject: [PATCH 19/19] skip cuda testing in CI since no CUDA_HOME --- .github/workflows/smoke-test.yml | 16 ---------------- test/test_directory_backend.py | 7 ++++++- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml index ee4af4d8..2adf0052 100644 --- a/.github/workflows/smoke-test.yml +++ b/.github/workflows/smoke-test.yml @@ -24,22 +24,6 @@ jobs: - name: Install package and dependencies run: uv sync --dev - - name: Verify CUDA environment - run: | - uv run python - <<'PY' - import torch, os, subprocess - print("PyTorch version:", torch.__version__) - print("CUDA available:", torch.cuda.is_available()) - print("CUDA device count:", torch.cuda.device_count()) - from torch.utils.cpp_extension import CUDA_HOME - print("CUDA_HOME:", CUDA_HOME) - try: - out = subprocess.check_output(["python", "-m", "nvidia.cuda.nvcc", "--version"], text=True) - print("NVCC version:\n", out) - except Exception as e: - print("NVCC not found:", e) - PY - - name: Clone FACTO source run: git clone https://github.com/pytorch-labs/FACTO.git diff --git a/test/test_directory_backend.py b/test/test_directory_backend.py index 247fff0f..1baaef2d 100644 --- a/test/test_directory_backend.py +++ b/test/test_directory_backend.py @@ -19,6 +19,11 @@ from BackendBench.backends import DirectoryBackend from BackendBench.utils import op_name_to_folder_name +try: + from torch.utils.cpp_extension import CUDA_HOME +except ImportError: + CUDA_HOME = None + @pytest.fixture(scope="class") def backend(request): @@ -144,7 +149,7 @@ def backend_cuda(request): @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") -# @pytest.mark.skipif("CUDA_HOME" not in os.environ, reason="CUDA_HOME is not available") +@pytest.mark.skipif(CUDA_HOME is None, reason="CUDA_HOME is not available") class TestDirectoryBackendCUDA: base_dir = "generated_kernels_cuda"