diff --git a/src/Base/Array4.H b/src/Base/Array4.H
index a0ede4cb..5f03c381 100644
--- a/src/Base/Array4.H
+++ b/src/Base/Array4.H
@@ -6,6 +6,7 @@
 #pragma once

 #include "pyAMReX.H"
+#include "dlpack/dlpack.h"

 #include
 #include
@@ -185,6 +186,7 @@ namespace pyAMReX
          */
+        /*
         // CPU: __array_interface__ v3
         // https://numpy.org/doc/stable/reference/arrays.interface.html
         .def_property_readonly("__array_interface__", [](Array4<T> const & a4) {
@@ -220,15 +222,80 @@
             d["version"] = 3;
             return d;
         })
+        */

-        // TODO: __dlpack__ __dlpack_device__
-        //       DLPack protocol (CPU, NVIDIA GPU, AMD GPU, Intel GPU, etc.)
+        // DLPack v1.1 protocol (CPU, NVIDIA GPU, AMD GPU, Intel GPU, etc.)
         //   https://dmlc.github.io/dlpack/latest/
-        //   https://data-apis.org/array-api/latest/design_topics/data_interchange.html
-        //   https://github.com/data-apis/consortium-feedback/issues/1
         //   https://github.com/dmlc/dlpack/blob/master/include/dlpack/dlpack.h
         //   https://docs.cupy.dev/en/stable/user_guide/interoperability.html#dlpack-data-exchange-protocol
+        .def("__dlpack__", [](
+            Array4<T> const &a4,
+            /* TODO: Handle keyword arguments */
+            [[maybe_unused]] std::optional<std::intptr_t> stream = std::nullopt,
+            [[maybe_unused]] std::optional<std::tuple<int, int>> max_version = std::nullopt,
+            [[maybe_unused]] std::optional<std::tuple<int, int>> dl_device = std::nullopt,
+            [[maybe_unused]] std::optional<bool> copy = std::nullopt
+
+        )
+        {
+            // Allocate shape/strides arrays
+            constexpr int ndim = 4;
+            auto const len = length(a4);
+
+            // Construct DLManagedTensorVersioned (DLPack 1.1 standard)
+            auto *dl_mgt_tensor = new DLManagedTensorVersioned;
+            dl_mgt_tensor->version = DLPackVersion{};
+            dl_mgt_tensor->flags = 0;  // No special flags
+            dl_mgt_tensor->dl_tensor.data = const_cast<void*>(static_cast<void const*>(a4.dataPtr()));
+            dl_mgt_tensor->dl_tensor.device = dlpack::detect_device_from_pointer(a4.dataPtr());
+            dl_mgt_tensor->dl_tensor.ndim = ndim;
+            dl_mgt_tensor->dl_tensor.dtype = dlpack::get_dlpack_dtype<T>();
+            dl_mgt_tensor->dl_tensor.shape = new int64_t[ndim]{a4.nComp(), len.z, len.y, len.x};
+            dl_mgt_tensor->dl_tensor.strides = new int64_t[ndim]{a4.nstride, a4.kstride, a4.jstride, 1};
+            dl_mgt_tensor->dl_tensor.byte_offset = 0;
+            dl_mgt_tensor->manager_ctx = nullptr;  // TODO: we can increase/decrease the Python ref counter of the producer here
+            dl_mgt_tensor->deleter = [](DLManagedTensorVersioned *self) {
+                delete[] self->dl_tensor.shape;
+                delete[] self->dl_tensor.strides;
+                delete self;
+            };
+            // Return as Python capsule
+            return py::capsule(
+                dl_mgt_tensor,
+                "dltensor_versioned",
+                /*[](void* ptr) {
+                    auto* tensor = static_cast<DLManagedTensorVersioned*>(ptr);
+                    tensor->deleter(tensor);
+                }*/
+                [](PyObject *capsule)
+                {
+                    if (PyCapsule_IsValid(capsule, "used_dltensor_versioned")) {
+                        return;  /* Do nothing if the capsule has been consumed. */
+                    }
+                    auto *p = static_cast<DLManagedTensorVersioned*>(
+                        PyCapsule_GetPointer(capsule, "dltensor_versioned"));
+                    if (p && p->deleter)
+                        p->deleter(p);
+                }
+            );
+        },
+        py::arg("stream") = py::none(),
+        py::arg("max_version") = py::none(),
+        py::arg("dl_device") = py::none(),
+        py::arg("copy") = py::none(),
+        R"doc(
+        DLPack protocol for zero-copy tensor exchange.
+        See https://dmlc.github.io/dlpack/latest/ for details.
+        )doc"
+        )
+        .def("__dlpack_device__", [](Array4<T> const &a4) {
+            DLDevice device = dlpack::detect_device_from_pointer(a4.dataPtr());
+            return std::make_tuple(static_cast<int>(device.device_type), device.device_id);
+        }, R"doc(
+        DLPack device info (device_type, device_id).
+ )doc") + .def("to_host", [](Array4 const & a4) { // py::tuple to std::vector diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 97a3e483..d8713f2f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,6 +2,7 @@ add_subdirectory(AmrCore) add_subdirectory(Base) #add_subdirectory(Boundary) +add_subdirectory(dlpack) #add_subdirectory(EB) #add_subdirectory(Extern) #add_subdirectory(LinearSolvers) diff --git a/src/amrex/extensions/Array4.py b/src/amrex/extensions/Array4.py index 8cd73f8a..64b06fab 100644 --- a/src/amrex/extensions/Array4.py +++ b/src/amrex/extensions/Array4.py @@ -1,7 +1,7 @@ """ This file is part of pyAMReX -Copyright 2023 AMReX community +Copyright 2023-2025 AMReX community Authors: Axel Huebl License: BSD-3-Clause-LBNL """ @@ -92,9 +92,52 @@ def array4_to_cupy(self, copy=False, order="F"): raise ValueError("The order argument must be F or C.") +def array4_to_dpnp(self, copy=False, order="F"): + """ + Provide a dpnp view into an Array4. + + This includes ngrow guard cells of the box. + + Note on the order of indices: + By default, this is as in AMReX in Fortran contiguous order, indexing as + x,y,z. This has performance implications for use in external libraries such + as dpnp. + The order="C" option will index as z,y,x and may perform better. + https://github.com/AMReX-Codes/pyamrex/issues/55#issuecomment-1579610074 + + Parameters + ---------- + self : amrex.Array4_* + An Array4 class in pyAMReX + copy : bool, optional + Copy the data if true, otherwise create a view (default). + order : string, optional + F order (default) or C. C is faster with external libraries. + + Returns + ------- + dpnp.array + A dpnp n-dimensional array. + + Raises + ------ + ImportError + Raises an exception if dpnp is not installed + """ + import dpnp as dp + + if order == "F": + return dp.from_dlpack(self, copy=copy).T + elif order == "C": + return dp.from_dlpack(self, copy=copy) + else: + raise ValueError("The order argument must be F or C.") + + def array4_to_xp(self, copy=False, order="F"): """ - Provide a NumPy or CuPy view into an Array4, depending on amr.Config.have_gpu . + Provide a NumPy, CuPy or dpnp view into an Array4, depending on amr.Config.have_gpu + and amr.Config.gpu_backend . This function is similar to CuPy's xp naming suggestion for CPU/GPU agnostic code: https://docs.cupy.dev/en/stable/user_guide/basic.html#how-to-write-cpu-gpu-agnostic-code @@ -120,14 +163,20 @@ def array4_to_xp(self, copy=False, order="F"): Returns ------- xp.array - A NumPy or CuPy n-dimensional array. + A NumPy, CuPy or dpnp n-dimensional array. 
""" import inspect amr = inspect.getmodule(self) - return ( - self.to_cupy(copy, order) if amr.Config.have_gpu else self.to_numpy(copy, order) - ) + + if amr.Config.have_gpu: + if amr.Config.gpu_backend == "SYCL": + return self.to_dpnp(copy, order) + else: # if not SYCL use cupy + return self.to_cupy(copy, order) + + # if no GPU, use NumPy + return self.to_numpy(copy, order) def register_Array4_extension(amr): @@ -144,4 +193,5 @@ def register_Array4_extension(amr): ): Array4_type.to_numpy = array4_to_numpy Array4_type.to_cupy = array4_to_cupy + Array4_type.to_dpnp = array4_to_dpnp Array4_type.to_xp = array4_to_xp diff --git a/src/amrex/extensions/MultiFab.py b/src/amrex/extensions/MultiFab.py index 61b5d159..dea96244 100644 --- a/src/amrex/extensions/MultiFab.py +++ b/src/amrex/extensions/MultiFab.py @@ -63,50 +63,10 @@ def mf_to_numpy(self, copy=False, order="F"): return views -def mf_to_cupy(self, copy=False, order="F"): - """ - Provide a CuPy view into a MultiFab. - - This includes ngrow guard cells of each box. - - Note on the order of indices: - By default, this is as in AMReX in Fortran contiguous order, indexing as - x,y,z. This has performance implications for use in external libraries such - as cupy. - The order="C" option will index as z,y,x and perform better with cupy. - https://github.com/AMReX-Codes/pyamrex/issues/55#issuecomment-1579610074 - - Parameters - ---------- - self : amrex.MultiFab - A MultiFab class in pyAMReX - copy : bool, optional - Copy the data if true, otherwise create a view (default). - order : string, optional - F order (default) or C. C is faster with external libraries. - - Returns - ------- - list of cupy.array - A list of CuPy n-dimensional arrays, for each local block in the - MultiFab. - - Raises - ------ - ImportError - Raises an exception if cupy is not installed - """ - views = [] - for mfi in self: - views.append(self.array(mfi).to_cupy(copy, order)) - - return views - - def mf_to_xp(self, copy=False, order="F"): """ - Provide a NumPy or CuPy view into a MultiFab, - depending on amr.Config.have_gpu . + Provide a NumPy, CuPy or dpnp view into a MultiFab, + depending on amr.Config.have_gpu and amr.Config.gpu_backend . This function is similar to CuPy's xp naming suggestion for CPU/GPU agnostic code: https://docs.cupy.dev/en/stable/user_guide/basic.html#how-to-write-cpu-gpu-agnostic-code @@ -132,15 +92,14 @@ def mf_to_xp(self, copy=False, order="F"): Returns ------- list of xp.array - A list of NumPy or CuPy n-dimensional arrays, for each local block in the - MultiFab. + A list of NumPy, CuPy or dpnp n-dimensional arrays, for each local block + in the MultiFab. """ - import inspect + views = [] + for mfi in self: + views.append(self.array(mfi).to_xp(copy, order)) - amr = inspect.getmodule(self) - return ( - self.to_cupy(copy, order) if amr.Config.have_gpu else self.to_numpy(copy, order) - ) + return views def copy_multifab(amr, self): @@ -490,6 +449,10 @@ def __getitem__(self, index, with_internal_ghosts=False): Whether to include internal ghost cells. When true, data from ghost cells may be used that overlaps valid cells. 
""" + import inspect + + amr = inspect.getmodule(self) + index4 = _process_index(self, index) # Gather the data to be included in a list to be sent to other processes @@ -503,17 +466,19 @@ def __getitem__(self, index, with_internal_ghosts=False): device_arr = _get_field(self, mfi) slice_arr = device_arr[block_slices] try: - # Copy data from host to device using cupy syntax - slice_arr = slice_arr.get() + if amr.Config.gpu_backend == "SYCL": + import dpnp + + slice_arr = dpnp.asnumpy(slice_arr) + else: + # Copy data from host to device using cupy syntax + slice_arr = slice_arr.get() except AttributeError: # Array is already a numpy array on the host pass datalist.append((global_slices, slice_arr)) # Gather the data from all processors - import inspect - - amr = inspect.getmodule(self) if amr.Config.have_mpi: npes = amr.ParallelDescriptor.NProcs() else: @@ -604,7 +569,10 @@ def __setitem__(self, index, value): amr = inspect.getmodule(self) if amr.Config.have_gpu: - import cupy as xp + if amr.Config.gpu_backend == "SYCL": + import dpnp as xp + else: + import cupy as xp else: xp = np @@ -653,7 +621,6 @@ def register_MultiFab_extension(amr): amr.MultiFab.__iter__ = lambda mfab: amr.MFIter(mfab) amr.MultiFab.to_numpy = mf_to_numpy - amr.MultiFab.to_cupy = mf_to_cupy amr.MultiFab.to_xp = mf_to_xp amr.MultiFab.copy = lambda self: copy_multifab(amr, self) @@ -669,7 +636,6 @@ def register_MultiFab_extension(amr): amr.iMultiFab.__iter__ = lambda imfab: amr.MFIter(imfab) amr.iMultiFab.to_numpy = mf_to_numpy - amr.iMultiFab.to_cupy = mf_to_cupy amr.iMultiFab.to_xp = mf_to_xp amr.iMultiFab.copy = lambda self: copy_multifab(amr, self) diff --git a/src/dlpack/CMakeLists.txt b/src/dlpack/CMakeLists.txt new file mode 100644 index 00000000..2a27f9a7 --- /dev/null +++ b/src/dlpack/CMakeLists.txt @@ -0,0 +1,6 @@ +foreach(D IN LISTS AMReX_SPACEDIM) + target_sources(pyAMReX_${D}d + PRIVATE + DLPack.cpp + ) +endforeach() diff --git a/src/dlpack/DLPack.cpp b/src/dlpack/DLPack.cpp new file mode 100644 index 00000000..29d806a9 --- /dev/null +++ b/src/dlpack/DLPack.cpp @@ -0,0 +1,35 @@ +#include "pyAMReX.H" + +#include "dlpack.h" + + +void init_DLPack(py::module& m) +{ + using namespace amrex; + + // register types only if not already present, e.g., from another library + // that also implements DLPack bindings and exposes the types + + // TODO: py::type pyDLDeviceType = py::type::of(); + bool pyDLDeviceType = false; + if (!pyDLDeviceType) { + py::native_enum(m, "DLDeviceType", "enum.IntEnum") + .value("kDLCPU", DLDeviceType::kDLCPU) + .value("kDLCUDA", DLDeviceType::kDLCUDA) + .value("kDLCUDAHost", DLDeviceType::kDLCUDAHost) + .value("kDLOpenCL", DLDeviceType::kDLOpenCL) + .value("kDLVulkan", DLDeviceType::kDLVulkan) + .value("kDLMetal", DLDeviceType::kDLMetal) + .value("kDLVPI", DLDeviceType::kDLVPI) + .value("kDLROCM", DLDeviceType::kDLROCM) + .value("kDLROCMHost", DLDeviceType::kDLROCMHost) + .value("kDLExtDev", DLDeviceType::kDLExtDev) + .value("kDLCUDAManaged", DLDeviceType::kDLCUDAManaged) + .value("kDLOneAPI", DLDeviceType::kDLOneAPI) + .value("kDLWebGPU", DLDeviceType::kDLWebGPU) + .value("kDLHexagon", DLDeviceType::kDLHexagon) + .value("kDLMAIA", DLDeviceType::kDLMAIA) + ; + } + +} diff --git a/src/dlpack/dlpack.h b/src/dlpack/dlpack.h new file mode 100644 index 00000000..e25fb19f --- /dev/null +++ b/src/dlpack/dlpack.h @@ -0,0 +1,437 @@ +/*! +* Copyright (c) 2017 by Contributors + * \file dlpack.h + * \brief The common header of DLPack. 
+ *
+ * Source: https://github.com/dmlc/dlpack/blob/v1.1/include/dlpack/dlpack.h
+ */
+#ifndef AMREX_DLPACK_H_
+#define AMREX_DLPACK_H_
+
+#include
+#include
+
+#include
+#include
+
+extern "C" {
+
+#include
+#include
+
+/*!
+ * \brief The DLPack version.
+ *
+ * A change in major version indicates that we have changed the
+ * data layout of the ABI - DLManagedTensorVersioned.
+ *
+ * A change in minor version indicates that we have added new
+ * code, such as a new device type, but the ABI is kept the same.
+ *
+ * If an obtained DLPack tensor has a major version that disagrees
+ * with the version number specified in this header file
+ * (i.e. major != DLPACK_MAJOR_VERSION), the consumer must call the deleter
+ * (and it is safe to do so). It is not safe to access any other fields
+ * as the memory layout will have changed.
+ *
+ * In the case of a minor version mismatch, the tensor can be safely used as
+ * long as the consumer knows how to interpret all fields. Minor version
+ * updates indicate the addition of enumeration values.
+ */
+typedef struct {
+  /*! \brief DLPack major version. */
+  uint32_t major = 1;
+  /*! \brief DLPack minor version. */
+  uint32_t minor = 1;
+} DLPackVersion;
+
+/*!
+ * \brief The device type in DLDevice.
+ */
+typedef enum : int32_t {
+  /*! \brief CPU device */
+  kDLCPU = 1,
+  /*! \brief CUDA GPU device */
+  kDLCUDA = 2,
+  /*!
+   * \brief Pinned CUDA CPU memory by cudaMallocHost
+   */
+  kDLCUDAHost = 3,
+  /*! \brief OpenCL devices. */
+  kDLOpenCL = 4,
+  /*! \brief Vulkan buffer for next generation graphics. */
+  kDLVulkan = 7,
+  /*! \brief Metal for Apple GPU. */
+  kDLMetal = 8,
+  /*! \brief Verilog simulator buffer */
+  kDLVPI = 9,
+  /*! \brief ROCm GPUs for AMD GPUs */
+  kDLROCM = 10,
+  /*!
+   * \brief Pinned ROCm CPU memory allocated by hipMallocHost
+   */
+  kDLROCMHost = 11,
+  /*!
+   * \brief Reserved extension device type,
+   * used for quickly test extension device
+   * The semantics can differ depending on the implementation.
+   */
+  kDLExtDev = 12,
+  /*!
+   * \brief CUDA managed/unified memory allocated by cudaMallocManaged
+   */
+  kDLCUDAManaged = 13,
+  /*!
+   * \brief Unified shared memory allocated on a oneAPI non-partititioned
+   * device. Call to oneAPI runtime is required to determine the device
+   * type, the USM allocation type and the sycl context it is bound to.
+   */
+  kDLOneAPI = 14,
+  /*! \brief GPU support for next generation WebGPU standard. */
+  kDLWebGPU = 15,
+  /*! \brief Qualcomm Hexagon DSP */
+  kDLHexagon = 16,
+  /*! \brief Microsoft MAIA devices */
+  kDLMAIA = 17,
+} DLDeviceType;
+
+/*!
+ * \brief A Device for Tensor and operator.
+ */
+typedef struct {
+  /*! \brief The device type used in the device. */
+  DLDeviceType device_type;
+  /*!
+   * \brief The device index.
+   * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
+   */
+  int32_t device_id;
+} DLDevice;
+
+/*!
+ * \brief The type code options DLDataType.
+ */
+typedef enum {
+  /*! \brief signed integer */
+  kDLInt = 0U,
+  /*! \brief unsigned integer */
+  kDLUInt = 1U,
+  /*! \brief IEEE floating point */
+  kDLFloat = 2U,
+  /*!
+   * \brief Opaque handle type, reserved for testing purposes.
+   * Frameworks need to agree on the handle data type for the exchange to be well-defined.
+   */
+  kDLOpaqueHandle = 3U,
+  /*! \brief bfloat16 */
+  kDLBfloat = 4U,
+  /*!
+   * \brief complex number
+   * (C/C++/Python layout: compact struct per complex number)
+   */
+  kDLComplex = 5U,
+  /*! \brief boolean */
+  kDLBool = 6U,
+  /*! \brief FP8 data types */
+  kDLFloat8_e3m4 = 7U,
+  kDLFloat8_e4m3 = 8U,
+  kDLFloat8_e4m3b11fnuz = 9U,
+  kDLFloat8_e4m3fn = 10U,
+  kDLFloat8_e4m3fnuz = 11U,
+  kDLFloat8_e5m2 = 12U,
+  kDLFloat8_e5m2fnuz = 13U,
+  kDLFloat8_e8m0fnu = 14U,
+  /*! \brief FP6 data types
+   * Setting bits != 6 is currently unspecified, and the producer must ensure it is set
+   * while the consumer must stop importing if the value is unexpected.
+   */
+  kDLFloat6_e2m3fn = 15U,
+  kDLFloat6_e3m2fn = 16U,
+  /*! \brief FP4 data types
+   * Setting bits != 4 is currently unspecified, and the producer must ensure it is set
+   * while the consumer must stop importing if the value is unexpected.
+   */
+  kDLFloat4_e2m1fn = 17U,
+} DLDataTypeCode;
+
+/*!
+ * \brief The data type the tensor can hold. The data type is assumed to follow the
+ * native endian-ness. An explicit error message should be raised when attempting to
+ * export an array with non-native endianness
+ *
+ * Examples
+ *  - float: type_code = 2, bits = 32, lanes = 1
+ *  - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4
+ *  - int8: type_code = 0, bits = 8, lanes = 1
+ *  - std::complex<float>: type_code = 5, bits = 64, lanes = 1
+ *  - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits)
+ *  - float8_e4m3: type_code = 8, bits = 8, lanes = 1 (packed in memory)
+ *  - float6_e3m2fn: type_code = 16, bits = 6, lanes = 1 (packed in memory)
+ *  - float4_e2m1fn: type_code = 17, bits = 4, lanes = 1 (packed in memory)
+ *
+ * When a sub-byte type is packed, DLPack requires the data to be in little bit-endian, i.e.,
+ * for a packed data set D ((D >> (i * bits)) && bit_mask) stores the i-th element.
+ */
+typedef struct {
+  /*!
+   * \brief Type code of base types.
+   * We keep it uint8_t instead of DLDataTypeCode for minimal memory
+   * footprint, but the value should be one of DLDataTypeCode enum values.
+   */
+  uint8_t code;
+  /*!
+   * \brief Number of bits, common choices are 8, 16, 32.
+   */
+  uint8_t bits;
+  /*! \brief Number of lanes in the type, used for vector types. */
+  uint16_t lanes;
+} DLDataType;
+
+/*!
+ * \brief Plain C Tensor object, does not manage memory.
+ */
+typedef struct {
+  /*!
+   * \brief The data pointer points to the allocated data. This will be CUDA
+   * device pointer or cl_mem handle in OpenCL. It may be opaque on some device
+   * types. This pointer is always aligned to 256 bytes as in CUDA. The
+   * `byte_offset` field should be used to point to the beginning of the data.
+   *
+   * Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, TensorFlow,
+   * TVM, perhaps others) do not adhere to this 256 byte alignment requirement
+   * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed
+   * (after which this note will be updated); at the moment it is recommended
+   * to not rely on the data pointer being correctly aligned.
+   *
+   * For given DLTensor, the size of memory required to store the contents of
+   * data is calculated as follows:
+   *
+   * \code{.c}
+   * static inline size_t GetDataSize(const DLTensor* t) {
+   *   size_t size = 1;
+   *   for (tvm_index_t i = 0; i < t->ndim; ++i) {
+   *     size *= t->shape[i];
+   *   }
+   *   size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
+   *   return size;
+   * }
+   * \endcode
+   *
+   * Note that if the tensor is of size zero, then the data pointer should be
+   * set to `NULL`.
+   */
+  void* data;
+  /*! \brief The device of the tensor */
+  DLDevice device;
+  /*! \brief Number of dimensions */
+  int32_t ndim;
+  /*! \brief The data type of the pointer */
+  DLDataType dtype;
+  /*! \brief The shape of the tensor */
+  int64_t* shape;
+  /*!
+   * \brief strides of the tensor (in number of elements, not bytes)
+   * can be NULL, indicating tensor is compact and row-majored.
+   */
+  int64_t* strides;
+  /*! \brief The offset in bytes to the beginning pointer to data */
+  uint64_t byte_offset;
+} DLTensor;
+
+// bit masks used in the DLManagedTensorVersioned
+
+/*! \brief bit mask to indicate that the tensor is read only. */
+#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL)
+
+/*!
+ * \brief bit mask to indicate that the tensor is a copy made by the producer.
+ *
+ * If set, the tensor is considered solely owned throughout its lifetime by the
+ * consumer, until the producer-provided deleter is invoked.
+ */
+#define DLPACK_FLAG_BITMASK_IS_COPIED (1UL << 1UL)
+
+/*
+ * \brief bit mask to indicate that whether a sub-byte type is packed or padded.
+ *
+ * The default for sub-byte types (ex: fp4/fp6) is assumed packed. This flag can
+ * be set by the producer to signal that a tensor of sub-byte type is padded.
+ */
+#define DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED (1UL << 2UL)
+
+/*!
+ * \brief A versioned and managed C Tensor object, manage memory of DLTensor.
+ *
+ * This data structure is intended to facilitate the borrowing of DLTensor by
+ * another framework. It is not meant to transfer the tensor. When the borrowing
+ * framework doesn't need the tensor, it should call the deleter to notify the
+ * host that the resource is no longer needed.
+ *
+ * \note This is the current standard DLPack exchange data structure.
+ */
+struct DLManagedTensorVersioned {
+  /*!
+   * \brief The API and ABI version of the current managed Tensor
+   */
+  DLPackVersion version;
+  /*!
+   * \brief the context of the original host framework.
+   *
+   * Stores DLManagedTensorVersioned is used in the
+   * framework. It can also be NULL.
+   */
+  void *manager_ctx;
+  /*!
+   * \brief Destructor.
+   *
+   * This should be called to destruct manager_ctx which holds the DLManagedTensorVersioned.
+   * It can be NULL if there is no way for the caller to provide a reasonable
+   * destructor. The destructor deletes the argument self as well.
+   */
+  void (*deleter)(struct DLManagedTensorVersioned *self);
+  /*!
+   * \brief Additional bitmask flags information about the tensor.
+   *
+   * By default the flags should be set to 0.
+   *
+   * \note Future ABI changes should keep everything until this field
+   *       stable, to ensure that deleter can be correctly called.
+   *
+   * \sa DLPACK_FLAG_BITMASK_READ_ONLY
+   * \sa DLPACK_FLAG_BITMASK_IS_COPIED
+   */
+  uint64_t flags;
+  /*! \brief DLTensor which is being memory managed */
+  DLTensor dl_tensor;
+};
+
+}  // extern "C"
+
+namespace pyAMReX::dlpack
+{
+
+    template <typename T>
+    AMREX_INLINE
+    DLDataType get_dlpack_dtype ()
+    {
+        using V = std::decay_t<T>;
+        DLDataType dtype{};
+
+        if constexpr (std::is_same_v<V, float>) {
+            dtype.code = kDLFloat;
+            dtype.bits = 32;
+            dtype.lanes = 1;
+        }
+        else if constexpr (std::is_same_v<V, double>) {
+            dtype.code = kDLFloat;
+            dtype.bits = 64;
+            dtype.lanes = 1;
+        }
+        else if constexpr (std::is_same_v<V, std::int32_t>) {
+            dtype.code = kDLInt;
+            dtype.bits = 32;
+            dtype.lanes = 1;
+        }
+        else if constexpr (std::is_same_v<V, std::int64_t>) {
+            dtype.code = kDLInt;
+            dtype.bits = 64;
+            dtype.lanes = 1;
+        }
+        else if constexpr (std::is_same_v<V, std::uint32_t>) {
+            dtype.code = kDLUInt;
+            dtype.bits = 32;
+            dtype.lanes = 1;
+        }
+        else if constexpr (std::is_same_v<V, std::uint64_t>) {
+            dtype.code = kDLUInt;
+            dtype.bits = 64;
+            dtype.lanes = 1;
+        }
+        else {
+            throw std::runtime_error("Unsupported dtype for DLPack");
+        }
+
+        return dtype;
+    }
+
+    AMREX_INLINE
+    DLDevice detect_device_from_pointer ([[maybe_unused]] const void* ptr)
+    {
+        DLDevice device{ kDLCPU, 0 };
+
+#ifdef AMREX_USE_CUDA
+        // Check if data is on GPU by checking if pointer is in CUDA memory
+        // note: cudaPointerGetAttributes is quite expensive, remove and
+        //       assume device-side if need be.
+        cudaPointerAttributes attr;
+        cudaError_t err = cudaPointerGetAttributes(&attr, ptr);
+        if (err == cudaSuccess && attr.type == cudaMemoryTypeDevice) {
+            device.device_type = kDLCUDA;
+            device.device_id = attr.device;
+        }
+#elif defined(AMREX_USE_HIP)
+        // Check if data is on GPU by checking if pointer is in HIP memory
+        // note: hipPointerGetAttributes is quite expensive, remove and
+        //       assume device-side if need be.
+        hipPointerAttribute_t attr;
+        hipError_t err = hipPointerGetAttributes(&attr, ptr);
+        if (err == hipSuccess && attr.memoryType == hipMemoryTypeDevice) {
+            device.device_type = kDLROCM;
+            device.device_id = attr.device;
+        }
+
+#elif defined(AMREX_USE_DPCPP)
+        // try {
+        // Get the SYCL context and queue from AMReX
+        auto const& queue = amrex::Gpu::Device::streamQueue();
+        auto const& context = queue.get_context();
+
+        // Try to get pointer attributes using SYCL USM queries
+        auto usm_type = sycl::get_pointer_type(ptr, context);
+
+        if (usm_type == sycl::usm::alloc::device ||
+            usm_type == sycl::usm::alloc::shared) {
+            device.device_type = kDLOneAPI;
+
+            // Try to get the actual device from the pointer
+            try {
+                auto device_ptr = sycl::get_pointer_device(ptr, context);
+                device.device_id = 0;  // Default to first device
+
+                auto devices = context.get_devices();
+                for (size_t i = 0; i < devices.size(); ++i) {
+                    if (devices[i] == device_ptr) {
+                        device.device_id = static_cast<int32_t>(i);
+                        break;
+                    }
+                }
+            } catch (const sycl::exception&) {
+                // If we can't determine the specific device, default to 0
+                device.device_id = 0;
+            }
+        } else if (usm_type == sycl::usm::alloc::host) {
+            // Host USM allocation - still oneAPI but accessible from host
+            device.device_type = kDLOneAPI;
+            device.device_id = 0;
+        }
+        // If usm_type is sycl::usm::alloc::unknown, it might be regular CPU memory
+        // In that case, we keep the default CPU device type set above
+
+        /*
+        }
+        catch (const sycl::exception&) {
+            // If SYCL queries fail, assume it's regular CPU memory
+            // device remains as kDLCPU, 0 (set at function start)
+        } catch (...) {
+            // Handle any other exceptions gracefully
+            // device remains as kDLCPU, 0 (set at function start)
+        }
+        */
+#endif
+
+        return device;
+    }
+
+} // namespace pyAMReX::dlpack
+
+#endif // AMREX_DLPACK_H_
diff --git a/src/pyAMReX.cpp b/src/pyAMReX.cpp
index 36ce03d0..ab9ccfa9 100644
--- a/src/pyAMReX.cpp
+++ b/src/pyAMReX.cpp
@@ -20,6 +20,7 @@ void init_Arena(py::module&);
 void init_Array4(py::module&);
 void init_BaseFab(py::module&);
 void init_Box(py::module &);
+void init_DLPack(py::module &);
 void init_RealBox(py::module &);
 void init_BoxArray(py::module &);
 void init_CoordSys(py::module&);
@@ -98,6 +99,7 @@ PYBIND11_MODULE(amrex_3d_pybind, m) {

     // note: order from parent to child classes and argument usage
+    init_DLPack(m);
     init_AMReX(m);
     init_Arena(m);
     init_Dim3(m);
diff --git a/tests/test_array4.py b/tests/test_array4.py
index 99c37d12..44570837 100644
--- a/tests/test_array4.py
+++ b/tests/test_array4.py
@@ -31,7 +31,9 @@ def test_array4():
     )
     print(f"\nx: {x.__array_interface__} {x.dtype}")
     arr = amr.Array4_double(x)
-    print(f"arr: {arr.__array_interface__}")
+    print(f"arr: DLPack device info: {arr.__dlpack_device__()}")
+    # print(f"arr: DLPack: {arr.__dlpack__()}")
+    print(f"x.shape: {x.shape}")
     print(arr)

     assert arr.nComp == 1
@@ -44,16 +46,16 @@
     assert arr[0, 0, 0] == 1
     assert arr[3, 2, 1] == 1

-    # copy to numpy
-    c_arr2np = np.array(arr, copy=True)  # segfaults on Windows
+    # copy to numpy using DLPack
+    c_arr2np = np.from_dlpack(arr)
     assert c_arr2np.ndim == 4
    assert c_arr2np.dtype == np.dtype("double")
     print(f"c_arr2np: {c_arr2np.__array_interface__}")
     np.testing.assert_array_equal(x, c_arr2np[0, :, :, :])
     assert c_arr2np[0, 1, 1, 1] == 42

-    # view to numpy
-    v_arr2np = np.array(arr, copy=False)
+    # view to numpy using DLPack
+    v_arr2np = np.from_dlpack(arr)
     assert c_arr2np.ndim == 4
     assert v_arr2np.dtype == np.dtype("double")
     np.testing.assert_array_equal(x, v_arr2np[0, :, :, :])
@@ -65,7 +67,7 @@

     # copy array4 (view)
     c_arr = amr.Array4_double(arr)
-    v_carr2np = np.array(c_arr, copy=False)
+    v_carr2np = np.from_dlpack(c_arr)
     x[1, 1, 1] = 44
     assert v_carr2np[0, 1, 1, 1] == 44
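Usage sketch: a minimal example of how the new `__dlpack__`/`__dlpack_device__` bindings can be exercised on a CPU build, mirroring tests/test_array4.py. The `amrex.space3d` module name, `amr.initialize`/`amr.finalize`, and a NumPy version new enough to consume DLPack 1.x versioned capsules are assumptions not shown in this patch.

# Minimal sketch, assuming a CPU build of pyAMReX and DLPack-1.x-capable NumPy.
import numpy as np
import amrex.space3d as amr  # assumed module name

amr.initialize([])

x = np.ones((2, 3, 4))                 # host data
arr = amr.Array4_double(x)             # wraps x without copying

print(arr.__dlpack_device__())         # (1, 0) == (kDLCPU, 0) on the host

view = np.from_dlpack(arr)             # zero-copy import through __dlpack__
x[1, 1, 1] = 42.0
assert view[0, 1, 1, 1] == 42.0        # same memory: the change is visible

amr.finalize()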