diff --git a/src/Base/Array4.H b/src/Base/Array4.H
index a0ede4cb..5f03c381 100644
--- a/src/Base/Array4.H
+++ b/src/Base/Array4.H
@@ -6,6 +6,7 @@
 #pragma once

 #include "pyAMReX.H"
+#include "dlpack/dlpack.h"

 #include
 #include
@@ -185,6 +186,7 @@ namespace pyAMReX
          */
+        /*
         // CPU: __array_interface__ v3
         // https://numpy.org/doc/stable/reference/arrays.interface.html
         .def_property_readonly("__array_interface__", [](Array4<T> const & a4) {
@@ -220,15 +222,80 @@
             d["version"] = 3;
             return d;
         })
+        */

-        // TODO: __dlpack__ __dlpack_device__
-        //       DLPack protocol (CPU, NVIDIA GPU, AMD GPU, Intel GPU, etc.)
+        // DLPack v1.1 protocol (CPU, NVIDIA GPU, AMD GPU, Intel GPU, etc.)
         //   https://dmlc.github.io/dlpack/latest/
-        //   https://data-apis.org/array-api/latest/design_topics/data_interchange.html
-        //   https://github.com/data-apis/consortium-feedback/issues/1
         //   https://github.com/dmlc/dlpack/blob/master/include/dlpack/dlpack.h
         //   https://docs.cupy.dev/en/stable/user_guide/interoperability.html#dlpack-data-exchange-protocol
+        .def("__dlpack__", [](
+            Array4<T> const &a4,
+            /* TODO: Handle keyword arguments */
+            [[maybe_unused]] std::optional<std::intptr_t> stream = std::nullopt,
+            [[maybe_unused]] std::optional<std::tuple<int, int>> max_version = std::nullopt,
+            [[maybe_unused]] std::optional<std::tuple<int, int>> dl_device = std::nullopt,
+            [[maybe_unused]] std::optional<bool> copy = std::nullopt
+
+        )
+        {
+            // Allocate shape/strides arrays
+            constexpr int ndim = 4;
+            auto const len = length(a4);
+
+            // Construct DLManagedTensorVersioned (DLPack 1.1 standard)
+            auto *dl_mgt_tensor = new DLManagedTensorVersioned;
+            dl_mgt_tensor->version = DLPackVersion{};
+            dl_mgt_tensor->flags = 0;  // No special flags
+            dl_mgt_tensor->dl_tensor.data = const_cast<void*>(static_cast<void const*>(a4.dataPtr()));
+            dl_mgt_tensor->dl_tensor.device = dlpack::detect_device_from_pointer(a4.dataPtr());
+            dl_mgt_tensor->dl_tensor.ndim = ndim;
+            dl_mgt_tensor->dl_tensor.dtype = dlpack::get_dlpack_dtype<T>();
+            dl_mgt_tensor->dl_tensor.shape = new int64_t[ndim]{a4.nComp(), len.z, len.y, len.x};
+            dl_mgt_tensor->dl_tensor.strides = new int64_t[ndim]{a4.nstride, a4.kstride, a4.jstride, 1};
+            dl_mgt_tensor->dl_tensor.byte_offset = 0;
+            dl_mgt_tensor->manager_ctx = nullptr;  // TODO: we can increase/decrease the Python ref counter of the producer here
+            dl_mgt_tensor->deleter = [](DLManagedTensorVersioned *self) {
+                delete[] self->dl_tensor.shape;
+                delete[] self->dl_tensor.strides;
+                delete self;
+            };
+            // Return as Python capsule
+            return py::capsule(
+                dl_mgt_tensor,
+                "dltensor_versioned",
+                /*[](void* ptr) {
+                    auto* tensor = static_cast<DLManagedTensorVersioned*>(ptr);
+                    tensor->deleter(tensor);
+                }*/
+                [](PyObject *capsule)
+                {
+                    if (PyCapsule_IsValid(capsule, "used_dltensor_versioned")) {
+                        return;  /* Do nothing if the capsule has been consumed. */
+                    }
+                    auto *p = static_cast<DLManagedTensorVersioned*>(
+                        PyCapsule_GetPointer(capsule, "dltensor_versioned"));
+                    if (p && p->deleter)
+                        p->deleter(p);
+                }
+            );
+        },
+        py::arg("stream") = py::none(),
+        py::arg("max_version") = py::none(),
+        py::arg("dl_device") = py::none(),
+        py::arg("copy") = py::none(),
+        R"doc(
+        DLPack protocol for zero-copy tensor exchange.
+        See https://dmlc.github.io/dlpack/latest/ for details.
+        )doc"
+        )
+        .def("__dlpack_device__", [](Array4<T> const &a4) {
+            DLDevice device = dlpack::detect_device_from_pointer(a4.dataPtr());
+            return std::make_tuple(static_cast<int>(device.device_type), device.device_id);
+        }, R"doc(
+        DLPack device info (device_type, device_id).
+ )doc") + .def("to_host", [](Array4 const & a4) { // py::tuple to std::vector diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 97a3e483..d8713f2f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,6 +2,7 @@ add_subdirectory(AmrCore) add_subdirectory(Base) #add_subdirectory(Boundary) +add_subdirectory(dlpack) #add_subdirectory(EB) #add_subdirectory(Extern) #add_subdirectory(LinearSolvers) diff --git a/src/amrex/extensions/Array4.py b/src/amrex/extensions/Array4.py index 8cd73f8a..64b06fab 100644 --- a/src/amrex/extensions/Array4.py +++ b/src/amrex/extensions/Array4.py @@ -1,7 +1,7 @@ """ This file is part of pyAMReX -Copyright 2023 AMReX community +Copyright 2023-2025 AMReX community Authors: Axel Huebl License: BSD-3-Clause-LBNL """ @@ -92,9 +92,52 @@ def array4_to_cupy(self, copy=False, order="F"): raise ValueError("The order argument must be F or C.") +def array4_to_dpnp(self, copy=False, order="F"): + """ + Provide a dpnp view into an Array4. + + This includes ngrow guard cells of the box. + + Note on the order of indices: + By default, this is as in AMReX in Fortran contiguous order, indexing as + x,y,z. This has performance implications for use in external libraries such + as dpnp. + The order="C" option will index as z,y,x and may perform better. + https://github.com/AMReX-Codes/pyamrex/issues/55#issuecomment-1579610074 + + Parameters + ---------- + self : amrex.Array4_* + An Array4 class in pyAMReX + copy : bool, optional + Copy the data if true, otherwise create a view (default). + order : string, optional + F order (default) or C. C is faster with external libraries. + + Returns + ------- + dpnp.array + A dpnp n-dimensional array. + + Raises + ------ + ImportError + Raises an exception if dpnp is not installed + """ + import dpnp as dp + + if order == "F": + return dp.from_dlpack(self, copy=copy).T + elif order == "C": + return dp.from_dlpack(self, copy=copy) + else: + raise ValueError("The order argument must be F or C.") + + def array4_to_xp(self, copy=False, order="F"): """ - Provide a NumPy or CuPy view into an Array4, depending on amr.Config.have_gpu . + Provide a NumPy, CuPy or dpnp view into an Array4, depending on amr.Config.have_gpu + and amr.Config.gpu_backend . This function is similar to CuPy's xp naming suggestion for CPU/GPU agnostic code: https://docs.cupy.dev/en/stable/user_guide/basic.html#how-to-write-cpu-gpu-agnostic-code @@ -120,14 +163,20 @@ def array4_to_xp(self, copy=False, order="F"): Returns ------- xp.array - A NumPy or CuPy n-dimensional array. + A NumPy, CuPy or dpnp n-dimensional array. 
""" import inspect amr = inspect.getmodule(self) - return ( - self.to_cupy(copy, order) if amr.Config.have_gpu else self.to_numpy(copy, order) - ) + + if amr.Config.have_gpu: + if amr.Config.gpu_backend == "SYCL": + return self.to_dpnp(copy, order) + else: # if not SYCL use cupy + return self.to_cupy(copy, order) + + # if no GPU, use NumPy + return self.to_numpy(copy, order) def register_Array4_extension(amr): @@ -144,4 +193,5 @@ def register_Array4_extension(amr): ): Array4_type.to_numpy = array4_to_numpy Array4_type.to_cupy = array4_to_cupy + Array4_type.to_dpnp = array4_to_dpnp Array4_type.to_xp = array4_to_xp diff --git a/src/amrex/extensions/MultiFab.py b/src/amrex/extensions/MultiFab.py index 61b5d159..dea96244 100644 --- a/src/amrex/extensions/MultiFab.py +++ b/src/amrex/extensions/MultiFab.py @@ -63,50 +63,10 @@ def mf_to_numpy(self, copy=False, order="F"): return views -def mf_to_cupy(self, copy=False, order="F"): - """ - Provide a CuPy view into a MultiFab. - - This includes ngrow guard cells of each box. - - Note on the order of indices: - By default, this is as in AMReX in Fortran contiguous order, indexing as - x,y,z. This has performance implications for use in external libraries such - as cupy. - The order="C" option will index as z,y,x and perform better with cupy. - https://github.com/AMReX-Codes/pyamrex/issues/55#issuecomment-1579610074 - - Parameters - ---------- - self : amrex.MultiFab - A MultiFab class in pyAMReX - copy : bool, optional - Copy the data if true, otherwise create a view (default). - order : string, optional - F order (default) or C. C is faster with external libraries. - - Returns - ------- - list of cupy.array - A list of CuPy n-dimensional arrays, for each local block in the - MultiFab. - - Raises - ------ - ImportError - Raises an exception if cupy is not installed - """ - views = [] - for mfi in self: - views.append(self.array(mfi).to_cupy(copy, order)) - - return views - - def mf_to_xp(self, copy=False, order="F"): """ - Provide a NumPy or CuPy view into a MultiFab, - depending on amr.Config.have_gpu . + Provide a NumPy, CuPy or dpnp view into a MultiFab, + depending on amr.Config.have_gpu and amr.Config.gpu_backend . This function is similar to CuPy's xp naming suggestion for CPU/GPU agnostic code: https://docs.cupy.dev/en/stable/user_guide/basic.html#how-to-write-cpu-gpu-agnostic-code @@ -132,15 +92,14 @@ def mf_to_xp(self, copy=False, order="F"): Returns ------- list of xp.array - A list of NumPy or CuPy n-dimensional arrays, for each local block in the - MultiFab. + A list of NumPy, CuPy or dpnp n-dimensional arrays, for each local block + in the MultiFab. """ - import inspect + views = [] + for mfi in self: + views.append(self.array(mfi).to_xp(copy, order)) - amr = inspect.getmodule(self) - return ( - self.to_cupy(copy, order) if amr.Config.have_gpu else self.to_numpy(copy, order) - ) + return views def copy_multifab(amr, self): @@ -490,6 +449,10 @@ def __getitem__(self, index, with_internal_ghosts=False): Whether to include internal ghost cells. When true, data from ghost cells may be used that overlaps valid cells. 
""" + import inspect + + amr = inspect.getmodule(self) + index4 = _process_index(self, index) # Gather the data to be included in a list to be sent to other processes @@ -503,17 +466,19 @@ def __getitem__(self, index, with_internal_ghosts=False): device_arr = _get_field(self, mfi) slice_arr = device_arr[block_slices] try: - # Copy data from host to device using cupy syntax - slice_arr = slice_arr.get() + if amr.Config.gpu_backend == "SYCL": + import dpnp + + slice_arr = dpnp.asnumpy(slice_arr) + else: + # Copy data from host to device using cupy syntax + slice_arr = slice_arr.get() except AttributeError: # Array is already a numpy array on the host pass datalist.append((global_slices, slice_arr)) # Gather the data from all processors - import inspect - - amr = inspect.getmodule(self) if amr.Config.have_mpi: npes = amr.ParallelDescriptor.NProcs() else: @@ -604,7 +569,10 @@ def __setitem__(self, index, value): amr = inspect.getmodule(self) if amr.Config.have_gpu: - import cupy as xp + if amr.Config.gpu_backend == "SYCL": + import dpnp as xp + else: + import cupy as xp else: xp = np @@ -653,7 +621,6 @@ def register_MultiFab_extension(amr): amr.MultiFab.__iter__ = lambda mfab: amr.MFIter(mfab) amr.MultiFab.to_numpy = mf_to_numpy - amr.MultiFab.to_cupy = mf_to_cupy amr.MultiFab.to_xp = mf_to_xp amr.MultiFab.copy = lambda self: copy_multifab(amr, self) @@ -669,7 +636,6 @@ def register_MultiFab_extension(amr): amr.iMultiFab.__iter__ = lambda imfab: amr.MFIter(imfab) amr.iMultiFab.to_numpy = mf_to_numpy - amr.iMultiFab.to_cupy = mf_to_cupy amr.iMultiFab.to_xp = mf_to_xp amr.iMultiFab.copy = lambda self: copy_multifab(amr, self) diff --git a/src/dlpack/CMakeLists.txt b/src/dlpack/CMakeLists.txt new file mode 100644 index 00000000..2a27f9a7 --- /dev/null +++ b/src/dlpack/CMakeLists.txt @@ -0,0 +1,6 @@ +foreach(D IN LISTS AMReX_SPACEDIM) + target_sources(pyAMReX_${D}d + PRIVATE + DLPack.cpp + ) +endforeach() diff --git a/src/dlpack/DLPack.cpp b/src/dlpack/DLPack.cpp new file mode 100644 index 00000000..29d806a9 --- /dev/null +++ b/src/dlpack/DLPack.cpp @@ -0,0 +1,35 @@ +#include "pyAMReX.H" + +#include "dlpack.h" + + +void init_DLPack(py::module& m) +{ + using namespace amrex; + + // register types only if not already present, e.g., from another library + // that also implements DLPack bindings and exposes the types + + // TODO: py::type pyDLDeviceType = py::type::of(); + bool pyDLDeviceType = false; + if (!pyDLDeviceType) { + py::native_enum(m, "DLDeviceType", "enum.IntEnum") + .value("kDLCPU", DLDeviceType::kDLCPU) + .value("kDLCUDA", DLDeviceType::kDLCUDA) + .value("kDLCUDAHost", DLDeviceType::kDLCUDAHost) + .value("kDLOpenCL", DLDeviceType::kDLOpenCL) + .value("kDLVulkan", DLDeviceType::kDLVulkan) + .value("kDLMetal", DLDeviceType::kDLMetal) + .value("kDLVPI", DLDeviceType::kDLVPI) + .value("kDLROCM", DLDeviceType::kDLROCM) + .value("kDLROCMHost", DLDeviceType::kDLROCMHost) + .value("kDLExtDev", DLDeviceType::kDLExtDev) + .value("kDLCUDAManaged", DLDeviceType::kDLCUDAManaged) + .value("kDLOneAPI", DLDeviceType::kDLOneAPI) + .value("kDLWebGPU", DLDeviceType::kDLWebGPU) + .value("kDLHexagon", DLDeviceType::kDLHexagon) + .value("kDLMAIA", DLDeviceType::kDLMAIA) + ; + } + +} diff --git a/src/dlpack/dlpack.h b/src/dlpack/dlpack.h new file mode 100644 index 00000000..e25fb19f --- /dev/null +++ b/src/dlpack/dlpack.h @@ -0,0 +1,437 @@ +/*! +* Copyright (c) 2017 by Contributors + * \file dlpack.h + * \brief The common header of DLPack. 
+ *
+ * Source: https://github.com/dmlc/dlpack/blob/v1.1/include/dlpack/dlpack.h
+ */
+#ifndef AMREX_DLPACK_H_
+#define AMREX_DLPACK_H_
+
+#include
+#include
+
+#include
+#include
+
+extern "C" {
+
+#include
+#include
+
+/*!
+ * \brief The DLPack version.
+ *
+ * A change in major version indicates that we have changed the
+ * data layout of the ABI - DLManagedTensorVersioned.
+ *
+ * A change in minor version indicates that we have added new
+ * code, such as a new device type, but the ABI is kept the same.
+ *
+ * If an obtained DLPack tensor has a major version that disagrees
+ * with the version number specified in this header file
+ * (i.e. major != DLPACK_MAJOR_VERSION), the consumer must call the deleter
+ * (and it is safe to do so). It is not safe to access any other fields
+ * as the memory layout will have changed.
+ *
+ * In the case of a minor version mismatch, the tensor can be safely used as
+ * long as the consumer knows how to interpret all fields. Minor version
+ * updates indicate the addition of enumeration values.
+ */
+typedef struct {
+  /*! \brief DLPack major version. */
+  uint32_t major = 1;
+  /*! \brief DLPack minor version. */
+  uint32_t minor = 1;
+} DLPackVersion;
+
+/*!
+ * \brief The device type in DLDevice.
+ */
+typedef enum : int32_t {
+  /*! \brief CPU device */
+  kDLCPU = 1,
+  /*! \brief CUDA GPU device */
+  kDLCUDA = 2,
+  /*!
+   * \brief Pinned CUDA CPU memory by cudaMallocHost
+   */
+  kDLCUDAHost = 3,
+  /*! \brief OpenCL devices. */
+  kDLOpenCL = 4,
+  /*! \brief Vulkan buffer for next generation graphics. */
+  kDLVulkan = 7,
+  /*! \brief Metal for Apple GPU. */
+  kDLMetal = 8,
+  /*! \brief Verilog simulator buffer */
+  kDLVPI = 9,
+  /*! \brief ROCm GPUs for AMD GPUs */
+  kDLROCM = 10,
+  /*!
+   * \brief Pinned ROCm CPU memory allocated by hipMallocHost
+   */
+  kDLROCMHost = 11,
+  /*!
+   * \brief Reserved extension device type,
+   * used for quickly test extension device
+   * The semantics can differ depending on the implementation.
+   */
+  kDLExtDev = 12,
+  /*!
+   * \brief CUDA managed/unified memory allocated by cudaMallocManaged
+   */
+  kDLCUDAManaged = 13,
+  /*!
+   * \brief Unified shared memory allocated on a oneAPI non-partititioned
+   * device. Call to oneAPI runtime is required to determine the device
+   * type, the USM allocation type and the sycl context it is bound to.
+   */
+  kDLOneAPI = 14,
+  /*! \brief GPU support for next generation WebGPU standard. */
+  kDLWebGPU = 15,
+  /*! \brief Qualcomm Hexagon DSP */
+  kDLHexagon = 16,
+  /*! \brief Microsoft MAIA devices */
+  kDLMAIA = 17,
+} DLDeviceType;
+
+/*!
+ * \brief A Device for Tensor and operator.
+ */
+typedef struct {
+  /*! \brief The device type used in the device. */
+  DLDeviceType device_type;
+  /*!
+   * \brief The device index.
+   * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
+   */
+  int32_t device_id;
+} DLDevice;
+
+/*!
+ * \brief The type code options DLDataType.
+ */
+typedef enum {
+  /*! \brief signed integer */
+  kDLInt = 0U,
+  /*! \brief unsigned integer */
+  kDLUInt = 1U,
+  /*! \brief IEEE floating point */
+  kDLFloat = 2U,
+  /*!
+   * \brief Opaque handle type, reserved for testing purposes.
+   * Frameworks need to agree on the handle data type for the exchange to be well-defined.
+   */
+  kDLOpaqueHandle = 3U,
+  /*! \brief bfloat16 */
+  kDLBfloat = 4U,
+  /*!
+   * \brief complex number
+   * (C/C++/Python layout: compact struct per complex number)
+   */
+  kDLComplex = 5U,
+  /*! \brief boolean */
+  kDLBool = 6U,
+  /*! \brief FP8 data types */
+  kDLFloat8_e3m4 = 7U,
+  kDLFloat8_e4m3 = 8U,
+  kDLFloat8_e4m3b11fnuz = 9U,
+  kDLFloat8_e4m3fn = 10U,
+  kDLFloat8_e4m3fnuz = 11U,
+  kDLFloat8_e5m2 = 12U,
+  kDLFloat8_e5m2fnuz = 13U,
+  kDLFloat8_e8m0fnu = 14U,
+  /*! \brief FP6 data types
+   * Setting bits != 6 is currently unspecified, and the producer must ensure it is set
+   * while the consumer must stop importing if the value is unexpected.
+   */
+  kDLFloat6_e2m3fn = 15U,
+  kDLFloat6_e3m2fn = 16U,
+  /*! \brief FP4 data types
+   * Setting bits != 4 is currently unspecified, and the producer must ensure it is set
+   * while the consumer must stop importing if the value is unexpected.
+   */
+  kDLFloat4_e2m1fn = 17U,
+} DLDataTypeCode;
+
+/*!
+ * \brief The data type the tensor can hold. The data type is assumed to follow the
+ * native endian-ness. An explicit error message should be raised when attempting to
+ * export an array with non-native endianness
+ *
+ * Examples
+ *  - float: type_code = 2, bits = 32, lanes = 1
+ *  - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4
+ *  - int8: type_code = 0, bits = 8, lanes = 1
+ *  - std::complex<float>: type_code = 5, bits = 64, lanes = 1
+ *  - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits)
+ *  - float8_e4m3: type_code = 8, bits = 8, lanes = 1 (packed in memory)
+ *  - float6_e3m2fn: type_code = 16, bits = 6, lanes = 1 (packed in memory)
+ *  - float4_e2m1fn: type_code = 17, bits = 4, lanes = 1 (packed in memory)
+ *
+ * When a sub-byte type is packed, DLPack requires the data to be in little bit-endian, i.e.,
+ * for a packed data set D ((D >> (i * bits)) && bit_mask) stores the i-th element.
+ */
+typedef struct {
+  /*!
+   * \brief Type code of base types.
+   * We keep it uint8_t instead of DLDataTypeCode for minimal memory
+   * footprint, but the value should be one of DLDataTypeCode enum values.
+   */
+  uint8_t code;
+  /*!
+   * \brief Number of bits, common choices are 8, 16, 32.
+   */
+  uint8_t bits;
+  /*! \brief Number of lanes in the type, used for vector types. */
+  uint16_t lanes;
+} DLDataType;
+
+/*!
+ * \brief Plain C Tensor object, does not manage memory.
+ */
+typedef struct {
+  /*!
+   * \brief The data pointer points to the allocated data. This will be CUDA
+   * device pointer or cl_mem handle in OpenCL. It may be opaque on some device
+   * types. This pointer is always aligned to 256 bytes as in CUDA. The
+   * `byte_offset` field should be used to point to the beginning of the data.
+   *
+   * Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, TensorFlow,
+   * TVM, perhaps others) do not adhere to this 256 byte alignment requirement
+   * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed
+   * (after which this note will be updated); at the moment it is recommended
+   * to not rely on the data pointer being correctly aligned.
+   *
+   * For given DLTensor, the size of memory required to store the contents of
+   * data is calculated as follows:
+   *
+   * \code{.c}
+   * static inline size_t GetDataSize(const DLTensor* t) {
+   *   size_t size = 1;
+   *   for (tvm_index_t i = 0; i < t->ndim; ++i) {
+   *     size *= t->shape[i];
+   *   }
+   *   size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
+   *   return size;
+   * }
+   * \endcode
+   *
+   * Note that if the tensor is of size zero, then the data pointer should be
+   * set to `NULL`.
+   */
+  void* data;
+  /*! \brief The device of the tensor */
+  DLDevice device;
+  /*! \brief Number of dimensions */
+  int32_t ndim;
+  /*! \brief The data type of the pointer */
+  DLDataType dtype;
+  /*! \brief The shape of the tensor */
+  int64_t* shape;
+  /*!
+   * \brief strides of the tensor (in number of elements, not bytes)
+   * can be NULL, indicating tensor is compact and row-majored.
+   */
+  int64_t* strides;
+  /*! \brief The offset in bytes to the beginning pointer to data */
+  uint64_t byte_offset;
+} DLTensor;
+
+// bit masks used in the DLManagedTensorVersioned
+
+/*! \brief bit mask to indicate that the tensor is read only. */
+#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL)
+
+/*!
+ * \brief bit mask to indicate that the tensor is a copy made by the producer.
+ *
+ * If set, the tensor is considered solely owned throughout its lifetime by the
+ * consumer, until the producer-provided deleter is invoked.
+ */
+#define DLPACK_FLAG_BITMASK_IS_COPIED (1UL << 1UL)
+
+/*
+ * \brief bit mask to indicate that whether a sub-byte type is packed or padded.
+ *
+ * The default for sub-byte types (ex: fp4/fp6) is assumed packed. This flag can
+ * be set by the producer to signal that a tensor of sub-byte type is padded.
+ */
+#define DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED (1UL << 2UL)
+
+/*!
+ * \brief A versioned and managed C Tensor object, manage memory of DLTensor.
+ *
+ * This data structure is intended to facilitate the borrowing of DLTensor by
+ * another framework. It is not meant to transfer the tensor. When the borrowing
+ * framework doesn't need the tensor, it should call the deleter to notify the
+ * host that the resource is no longer needed.
+ *
+ * \note This is the current standard DLPack exchange data structure.
+ */
+struct DLManagedTensorVersioned {
+  /*!
+   * \brief The API and ABI version of the current managed Tensor
+   */
+  DLPackVersion version;
+  /*!
+   * \brief the context of the original host framework.
+   *
+   * Stores DLManagedTensorVersioned is used in the
+   * framework. It can also be NULL.
+   */
+  void *manager_ctx;
+  /*!
+   * \brief Destructor.
+   *
+   * This should be called to destruct manager_ctx which holds the DLManagedTensorVersioned.
+   * It can be NULL if there is no way for the caller to provide a reasonable
+   * destructor. The destructor deletes the argument self as well.
+   */
+  void (*deleter)(struct DLManagedTensorVersioned *self);
+  /*!
+   * \brief Additional bitmask flags information about the tensor.
+   *
+   * By default the flags should be set to 0.
+   *
+   * \note Future ABI changes should keep everything until this field
+   *       stable, to ensure that deleter can be correctly called.
+   *
+   * \sa DLPACK_FLAG_BITMASK_READ_ONLY
+   * \sa DLPACK_FLAG_BITMASK_IS_COPIED
+   */
+  uint64_t flags;
+  /*! \brief DLTensor which is being memory managed */
+  DLTensor dl_tensor;
+};
+
+}  // extern "C"
+
+namespace pyAMReX::dlpack
+{
+
+    template <typename T>
+    AMREX_INLINE
+    DLDataType get_dlpack_dtype ()
+    {
+        using V = std::decay_t<T>;
+        DLDataType dtype{};
+
+        if constexpr (std::is_same_v<V, float>) {
+            dtype.code = kDLFloat;
+            dtype.bits = 32;
+            dtype.lanes = 1;
+        }
+        else if constexpr (std::is_same_v<V, double>) {
+            dtype.code = kDLFloat;
+            dtype.bits = 64;
+            dtype.lanes = 1;
+        }
+        else if constexpr (std::is_same_v<V, std::int32_t>) {
+            dtype.code = kDLInt;
+            dtype.bits = 32;
+            dtype.lanes = 1;
+        }
+        else if constexpr (std::is_same_v<V, std::int64_t>) {
+            dtype.code = kDLInt;
+            dtype.bits = 64;
+            dtype.lanes = 1;
+        }
+        else if constexpr (std::is_same_v<V, std::uint32_t>) {
+            dtype.code = kDLUInt;
+            dtype.bits = 32;
+            dtype.lanes = 1;
+        }
+        else if constexpr (std::is_same_v<V, std::uint64_t>) {
+            dtype.code = kDLUInt;
+            dtype.bits = 64;
+            dtype.lanes = 1;
+        }
+        else {
+            throw std::runtime_error("Unsupported dtype for DLPack");
+        }
+
+        return dtype;
+    }
+
+    AMREX_INLINE
+    DLDevice detect_device_from_pointer ([[maybe_unused]] const void* ptr)
+    {
+        DLDevice device{ kDLCPU, 0 };
+
+#ifdef AMREX_USE_CUDA
+        // Check if data is on GPU by checking if pointer is in CUDA memory
+        // note: cudaPointerGetAttributes is quite expensive, remove and
+        //       assume device-side if need be.
+        cudaPointerAttributes attr;
+        cudaError_t err = cudaPointerGetAttributes(&attr, ptr);
+        if (err == cudaSuccess && attr.type == cudaMemoryTypeDevice) {
+            device.device_type = kDLCUDA;
+            device.device_id = attr.device;
+        }
+#elif defined(AMREX_USE_HIP)
+        // Check if data is on GPU by checking if pointer is in HIP memory
+        // note: hipPointerGetAttributes is quite expensive, remove and
+        //       assume device-side if need be.
+        hipPointerAttribute_t attr;
+        hipError_t err = hipPointerGetAttributes(&attr, ptr);
+        if (err == hipSuccess && attr.memoryType == hipMemoryTypeDevice) {
+            device.device_type = kDLROCM;
+            device.device_id = attr.device;
+        }
+
+#elif defined(AMREX_USE_DPCPP)
+        // try {
+        // Get the SYCL context and queue from AMReX
+        auto const& queue = amrex::Gpu::Device::streamQueue();
+        auto const& context = queue.get_context();
+
+        // Try to get pointer attributes using SYCL USM queries
+        auto usm_type = sycl::get_pointer_type(ptr, context);
+
+        if (usm_type == sycl::usm::alloc::device ||
+            usm_type == sycl::usm::alloc::shared) {
+            device.device_type = kDLOneAPI;
+
+            // Try to get the actual device from the pointer
+            try {
+                auto device_ptr = sycl::get_pointer_device(ptr, context);
+                device.device_id = 0;  // Default to first device
+
+                auto devices = context.get_devices();
+                for (size_t i = 0; i < devices.size(); ++i) {
+                    if (devices[i] == device_ptr) {
+                        device.device_id = static_cast<int32_t>(i);
+                        break;
+                    }
+                }
+            } catch (const sycl::exception&) {
+                // If we can't determine the specific device, default to 0
+                device.device_id = 0;
+            }
+        } else if (usm_type == sycl::usm::alloc::host) {
+            // Host USM allocation - still oneAPI but accessible from host
+            device.device_type = kDLOneAPI;
+            device.device_id = 0;
+        }
+        // If usm_type is sycl::usm::alloc::unknown, it might be regular CPU memory
+        // In that case, we keep the default CPU device type set above
+
+        /*
+        }
+        catch (const sycl::exception&) {
+            // If SYCL queries fail, assume it's regular CPU memory
+            // device remains as kDLCPU, 0 (set at function start)
+        } catch (...) {
+            // Handle any other exceptions gracefully
+            // device remains as kDLCPU, 0 (set at function start)
+        }
+        */
+#endif
+
+        return device;
+    }
+
+} // namespace pyAMReX::dlpack
+
+#endif // AMREX_DLPACK_H_
diff --git a/src/pyAMReX.cpp b/src/pyAMReX.cpp
index 36ce03d0..ab9ccfa9 100644
--- a/src/pyAMReX.cpp
+++ b/src/pyAMReX.cpp
@@ -20,6 +20,7 @@ void init_Arena(py::module&);
 void init_Array4(py::module&);
 void init_BaseFab(py::module&);
 void init_Box(py::module &);
+void init_DLPack(py::module &);
 void init_RealBox(py::module &);
 void init_BoxArray(py::module &);
 void init_CoordSys(py::module&);
@@ -98,6 +99,7 @@ PYBIND11_MODULE(amrex_3d_pybind, m) {

     // note: order from parent to child classes and argument usage
+    init_DLPack(m);
     init_AMReX(m);
     init_Arena(m);
     init_Dim3(m);
diff --git a/tests/test_array4.py b/tests/test_array4.py
index 99c37d12..44570837 100644
--- a/tests/test_array4.py
+++ b/tests/test_array4.py
@@ -31,7 +31,9 @@ def test_array4():
     )
     print(f"\nx: {x.__array_interface__} {x.dtype}")
     arr = amr.Array4_double(x)
-    print(f"arr: {arr.__array_interface__}")
+    print(f"arr: DLPack device info: {arr.__dlpack_device__()}")
+    # print(f"arr: DLPack: {arr.__dlpack__()}")
+    print(f"x.shape: {x.shape}")
     print(arr)

     assert arr.nComp == 1
@@ -44,16 +46,16 @@
     assert arr[0, 0, 0] == 1
     assert arr[3, 2, 1] == 1

-    # copy to numpy
-    c_arr2np = np.array(arr, copy=True)  # segfaults on Windows
+    # copy to numpy using DLPack
+    c_arr2np = np.from_dlpack(arr)
     assert c_arr2np.ndim == 4
    assert c_arr2np.dtype == np.dtype("double")
     print(f"c_arr2np: {c_arr2np.__array_interface__}")
     np.testing.assert_array_equal(x, c_arr2np[0, :, :, :])
     assert c_arr2np[0, 1, 1, 1] == 42

-    # view to numpy
-    v_arr2np = np.array(arr, copy=False)
+    # view to numpy using DLPack
+    v_arr2np = np.from_dlpack(arr)
     assert c_arr2np.ndim == 4
     assert v_arr2np.dtype == np.dtype("double")
     np.testing.assert_array_equal(x, v_arr2np[0, :, :, :])
@@ -65,7 +67,7 @@

     # copy array4 (view)
     c_arr = amr.Array4_double(arr)
-    v_carr2np = np.array(c_arr, copy=False)
+    v_carr2np = np.from_dlpack(c_arr)
     x[1, 1, 1] = 44
     assert v_carr2np[0, 1, 1, 1] == 44
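Usage sketch: a minimal example of how the new `__dlpack__`/`__dlpack_device__` bindings can be exercised on a CPU build, mirroring tests/test_array4.py. The `amrex.space3d` module name, `amr.initialize`/`amr.finalize`, and a NumPy version new enough to consume DLPack 1.x versioned capsules are assumptions not shown in this patch.

# Minimal sketch, assuming a CPU build of pyAMReX and DLPack-1.x-capable NumPy.
import numpy as np
import amrex.space3d as amr  # assumed module name

amr.initialize([])

x = np.ones((2, 3, 4))                 # host data
arr = amr.Array4_double(x)             # wraps x without copying

print(arr.__dlpack_device__())         # (1, 0) == (kDLCPU, 0) on the host

view = np.from_dlpack(arr)             # zero-copy import through __dlpack__
x[1, 1, 1] = 42.0
assert view[0, 1, 1, 1] == 42.0        # same memory: the change is visible

amr.finalize()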