diff --git a/CMakeLists.txt b/CMakeLists.txt
index 067d7d9435..e8067bda25 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,6 +37,7 @@ option(MGARD_ENABLE_OPENMP "Enable OpenMP support." OFF)
 option(MGARD_ENABLE_CUDA "Enable CUDA support" OFF)
 option(MGARD_ENABLE_SERIAL "Enable SERIAL support" ON)
 option(MGARD_ENABLE_HIP "Enable HIP support" OFF)
+option(MGARD_ENABLE_SYCL "Enable SYCL support" OFF)
 option(MGARD_ENABLE_LEGACY_CUDA "Enable legacy CUDA support" OFF)
 option(MGARD_ENABLE_CLI "Build executable." OFF)
@@ -88,11 +89,18 @@ endif()
 if (MGARD_ENABLE_SERIAL OR
     MGARD_ENABLE_CUDA OR
-    MGARD_ENABLE_HIP )
+    MGARD_ENABLE_HIP OR
+    MGARD_ENABLE_SYCL)
+
+  if(MGARD_ENABLE_SYCL)
+    set(MGARD_ENABLE_SERIAL OFF)
+  endif()
   set (CMAKE_CXX_STANDARD 17)
   set (CMAKE_CXX_STANDARD_REQUIRED ON)
   set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -w")
+
+  include(MgardXGenerateSource)
   add_subdirectory (src/mgard-x)
   add_subdirectory (include/mgard-x)
   add_subdirectory (include/mgard-x/MDR)
@@ -178,6 +186,12 @@ if (MGARD_ENABLE_HIP)
   set_source_files_properties(${MGARD_X_HIP_SRC} PROPERTIES LANGUAGE HIP)
 endif()
 
+if (MGARD_ENABLE_SYCL)
+  # No need to link with sycl libraries for now
+  # find_package(IntelDPCPP REQUIRED)
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -w")
+endif()
+
 find_package(ZLIB REQUIRED)
 find_package(zstd)
@@ -271,6 +285,7 @@ target_sources(
   ${MGARD_X_SERIAL_SRC}
   ${MGARD_X_CUDA_SRC}
   ${MGARD_X_HIP_SRC}
+  ${MGARD_X_SYCL_SRC}
 )
 set_target_properties(mgard-library PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
@@ -450,6 +465,7 @@ if (MGARD_ENABLE_HIP)
   message(STATUS "HIP Arch: ${CMAKE_HIP_ARCHITECTURES}")
   list(POP_BACK CMAKE_MESSAGE_INDENT)
 endif()
+message(STATUS "SYCL: ${MGARD_ENABLE_SYCL}")
 list(POP_BACK CMAKE_MESSAGE_INDENT)
 message(STATUS "LEGACY CUDA: ${MGARD_ENABLE_LEGACY_CUDA}")
 if (MGARD_ENABLE_LEGACY_CUDA)
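
Review note: with the new option threaded through to the configured header (see the `MGARDXConfig.h.in` hunk later in this patch), client code can test for the SYCL backend at compile time. A minimal sketch, assuming the configured `MGARDXConfig.h` is on the include path:

```cpp
// Minimal sketch: MGARD_ENABLE_SYCL is defined to 1 or 0 by the configured
// MGARDXConfig.h (see the MGARDXConfig.h.in change later in this patch).
#include "MGARDXConfig.h"
#include <iostream>

int main() {
#if MGARD_ENABLE_SYCL
  std::cout << "MGARD-X built with SYCL support\n";
#else
  std::cout << "MGARD-X built without SYCL support\n";
#endif
  return 0;
}
```

Note that turning SYCL on forces `MGARD_ENABLE_SERIAL` off above, so the two backends are mutually exclusive within a single build.
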
-d "${zstd_src_dir}" ]; then + git clone -b v1.5.0 https://github.com/facebook/zstd.git ${zstd_src_dir} +fi +mkdir -p ${zstd_build_dir} +cmake -S ${zstd_src_dir}/build/cmake -B ${zstd_build_dir}\ + -DZSTD_MULTITHREAD_SUPPORT=ON\ + -DCMAKE_INSTALL_LIBDIR=lib\ + -DCMAKE_INSTALL_PREFIX=${zstd_install_dir} +cmake --build ${zstd_build_dir} -j ${num_build_procs} +cmake --install ${zstd_build_dir} + +#build Protobuf +protobuf_dir=${build_dir}/protobuf +protobuf_src_dir=${protobuf_dir}/src +protobuf_build_dir=${protobuf_dir}/build +protobuf_install_dir=${install_dir} +if [ ! -d "${protobuf_src_dir}" ]; then + git clone -b v3.19.4 --recurse-submodules https://github.com/protocolbuffers/protobuf.git ${protobuf_src_dir} +fi +mkdir -p ${protobuf_build_dir} +cmake -S ${protobuf_src_dir}/cmake -B ${protobuf_build_dir}\ + -Dprotobuf_BUILD_SHARED_LIBS=ON\ + -DCMAKE_INSTALL_PREFIX=${protobuf_install_dir} +cmake --build ${protobuf_build_dir} -j ${num_build_procs} +cmake --install ${protobuf_build_dir} + + +#build MGARD +mgard_x_build_dir=${build_dir}/mgard +mgard_x_install_dir=${install_dir} +mkdir -p ${mgard_x_build_dir} +cmake -S ${mgard_x_src_dir} -B ${mgard_x_build_dir} \ + -DCMAKE_PREFIX_PATH="${nvcomp_install_dir};${zstd_install_dir}/lib/cmake/zstd;${protobuf_install_dir}"\ + -DMGARD_ENABLE_SERIAL=OFF\ + -DMGARD_ENABLE_LEGACY_CUDA=ON\ + -DCMAKE_CUDA_ARCHITECTURES="75"\ + -DMGARD_ENABLE_DOCS=OFF\ + -DCMAKE_BUILD_TYPE=Release\ + -DCMAKE_INSTALL_PREFIX=${mgard_x_install_dir} +cmake --build ${mgard_x_build_dir} -j ${num_build_procs} +cmake --install ${mgard_x_build_dir} diff --git a/build_scripts/build_mgard_cuda_turing.sh b/build_scripts/build_mgard_cuda_turing.sh index 181398a53e..788b1d483b 100755 --- a/build_scripts/build_mgard_cuda_turing.sh +++ b/build_scripts/build_mgard_cuda_turing.sh @@ -15,7 +15,7 @@ mgard_x_src_dir=. # Build directory build_dir=./build-cuda-turing # Number of processors used for building -num_build_procs=8 +num_build_procs=16 # Installtaion directory install_dir=./install-cuda-turing diff --git a/build_scripts/build_mgard_sycl_gen9.sh b/build_scripts/build_mgard_sycl_gen9.sh new file mode 100755 index 0000000000..00d0620859 --- /dev/null +++ b/build_scripts/build_mgard_sycl_gen9.sh @@ -0,0 +1,74 @@ +#!/bin/sh + +# Copyright 2021, Oak Ridge National Laboratory. +# MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs +# Author: Jieyang Chen (chenj3@ornl.gov) +# Date: April 2, 2021 +# Script for building MGARD-X + +set -e +set -x + +######## User Configurations ######## +# Source directory +mgard_x_src_dir=. +# Build directory +build_dir=./build-sycl-gen9 +# Number of processors used for building +num_build_procs=8 +# Installtaion directory +install_dir=./install-sycl-gen9 + + +#build ZSTD +zstd_dir=${build_dir}/zstd +zstd_src_dir=${zstd_dir}/src +zstd_build_dir=${zstd_dir}/build +zstd_install_dir=${install_dir} +if [ ! -d "${zstd_src_dir}" ]; then + git clone -b v1.5.0 https://github.com/facebook/zstd.git ${zstd_src_dir} +fi +mkdir -p ${zstd_build_dir} +cmake -S ${zstd_src_dir}/build/cmake -B ${zstd_build_dir}\ + -DZSTD_MULTITHREAD_SUPPORT=ON\ + -DCMAKE_INSTALL_LIBDIR=lib\ + -DCMAKE_CXX_COMPILER=icpx\ + -DCMAKE_C_COMPILER=icx\ + -DCMAKE_INSTALL_PREFIX=${zstd_install_dir} +cmake --build ${zstd_build_dir} -j ${num_build_procs} +cmake --install ${zstd_build_dir} + +#build Protobuf +protobuf_dir=${build_dir}/protobuf +protobuf_src_dir=${protobuf_dir}/src +protobuf_build_dir=${protobuf_dir}/build +protobuf_install_dir=${install_dir} +if [ ! 
-d "${protobuf_src_dir}" ]; then + git clone -b v3.19.4 --recurse-submodules https://github.com/protocolbuffers/protobuf.git ${protobuf_src_dir} +fi +mkdir -p ${protobuf_build_dir} +cmake -S ${protobuf_src_dir}/cmake -B ${protobuf_build_dir}\ + -Dprotobuf_BUILD_SHARED_LIBS=ON\ + -DCMAKE_CXX_COMPILER=icpx\ + -DCMAKE_C_COMPILER=icx\ + -DCMAKE_INSTALL_PREFIX=${protobuf_install_dir} +cmake --build ${protobuf_build_dir} -j ${num_build_procs} +cmake --install ${protobuf_build_dir} + + +#build MGARD +mgard_x_build_dir=${build_dir}/mgard +mgard_x_install_dir=${install_dir} +mkdir -p ${mgard_x_build_dir} +cmake -S ${mgard_x_src_dir} -B ${mgard_x_build_dir} \ + -DCMAKE_PREFIX_PATH="${zstd_install_dir}/lib/cmake/zstd;${protobuf_install_dir}"\ + -DMGARD_ENABLE_SERIAL=OFF\ + -DMGARD_ENABLE_SYCL=ON\ + -DCMAKE_CXX_COMPILER=icpx\ + -DCMAKE_C_COMPILER=icx\ + -DCMAKE_CXX_FLAGS="-O2 -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend \"-device gen9\""\ + -DMGARD_ENABLE_DOCS=OFF\ + -DCMAKE_BUILD_TYPE=Release\ + -DCMAKE_INSTALL_PREFIX=${mgard_x_install_dir} +cmake --build ${mgard_x_build_dir} -j ${num_build_procs} +cmake --install ${mgard_x_build_dir} diff --git a/build_scripts/build_mgard_sycl_x86.sh b/build_scripts/build_mgard_sycl_x86.sh new file mode 100755 index 0000000000..4076053883 --- /dev/null +++ b/build_scripts/build_mgard_sycl_x86.sh @@ -0,0 +1,74 @@ +#!/bin/sh + +# Copyright 2021, Oak Ridge National Laboratory. +# MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs +# Author: Jieyang Chen (chenj3@ornl.gov) +# Date: April 2, 2021 +# Script for building MGARD-X + +set -e +set -x + +######## User Configurations ######## +# Source directory +mgard_x_src_dir=. +# Build directory +build_dir=./build-sycl-x86 +# Number of processors used for building +num_build_procs=8 +# Installtaion directory +install_dir=./install-sycl-x86 + + +#build ZSTD +zstd_dir=${build_dir}/zstd +zstd_src_dir=${zstd_dir}/src +zstd_build_dir=${zstd_dir}/build +zstd_install_dir=${install_dir} +if [ ! -d "${zstd_src_dir}" ]; then + git clone -b v1.5.0 https://github.com/facebook/zstd.git ${zstd_src_dir} +fi +mkdir -p ${zstd_build_dir} +cmake -S ${zstd_src_dir}/build/cmake -B ${zstd_build_dir}\ + -DZSTD_MULTITHREAD_SUPPORT=ON\ + -DCMAKE_INSTALL_LIBDIR=lib\ + -DCMAKE_CXX_COMPILER=icpx\ + -DCMAKE_C_COMPILER=icx\ + -DCMAKE_INSTALL_PREFIX=${zstd_install_dir} +cmake --build ${zstd_build_dir} -j ${num_build_procs} +cmake --install ${zstd_build_dir} + +#build Protobuf +protobuf_dir=${build_dir}/protobuf +protobuf_src_dir=${protobuf_dir}/src +protobuf_build_dir=${protobuf_dir}/build +protobuf_install_dir=${install_dir} +if [ ! 
-d "${protobuf_src_dir}" ]; then + git clone -b v3.19.4 --recurse-submodules https://github.com/protocolbuffers/protobuf.git ${protobuf_src_dir} +fi +mkdir -p ${protobuf_build_dir} +cmake -S ${protobuf_src_dir}/cmake -B ${protobuf_build_dir}\ + -Dprotobuf_BUILD_SHARED_LIBS=ON\ + -DCMAKE_CXX_COMPILER=icpx\ + -DCMAKE_C_COMPILER=icx\ + -DCMAKE_INSTALL_PREFIX=${protobuf_install_dir} +cmake --build ${protobuf_build_dir} -j ${num_build_procs} +cmake --install ${protobuf_build_dir} + + +#build MGARD +mgard_x_build_dir=${build_dir}/mgard +mgard_x_install_dir=${install_dir} +mkdir -p ${mgard_x_build_dir} +cmake -S ${mgard_x_src_dir} -B ${mgard_x_build_dir} \ + -DCMAKE_PREFIX_PATH="${zstd_install_dir}/lib/cmake/zstd;${protobuf_install_dir}"\ + -DMGARD_ENABLE_SERIAL=OFF\ + -DMGARD_ENABLE_SYCL=ON\ + -DCMAKE_CXX_COMPILER=icpx\ + -DCMAKE_C_COMPILER=icx\ + -DCMAKE_CXX_FLAGS="-fsycl -fsycl-targets=x86_64"\ + -DMGARD_ENABLE_DOCS=OFF\ + -DCMAKE_BUILD_TYPE=Release\ + -DCMAKE_INSTALL_PREFIX=${mgard_x_install_dir} +cmake --build ${mgard_x_build_dir} -j ${num_build_procs} +cmake --install ${mgard_x_build_dir} diff --git a/build_scripts/build_mgard_sycl_xehp.sh b/build_scripts/build_mgard_sycl_xehp.sh new file mode 100755 index 0000000000..5e23c299ed --- /dev/null +++ b/build_scripts/build_mgard_sycl_xehp.sh @@ -0,0 +1,74 @@ +#!/bin/sh + +# Copyright 2021, Oak Ridge National Laboratory. +# MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs +# Author: Jieyang Chen (chenj3@ornl.gov) +# Date: April 2, 2021 +# Script for building MGARD-X + +set -e +set -x + +######## User Configurations ######## +# Source directory +mgard_x_src_dir=. +# Build directory +build_dir=./build-sycl-xehp +# Number of processors used for building +num_build_procs=8 +# Installtaion directory +install_dir=./install-sycl-xehp + + +#build ZSTD +zstd_dir=${build_dir}/zstd +zstd_src_dir=${zstd_dir}/src +zstd_build_dir=${zstd_dir}/build +zstd_install_dir=${install_dir} +if [ ! -d "${zstd_src_dir}" ]; then + git clone -b v1.5.0 https://github.com/facebook/zstd.git ${zstd_src_dir} +fi +mkdir -p ${zstd_build_dir} +cmake -S ${zstd_src_dir}/build/cmake -B ${zstd_build_dir}\ + -DZSTD_MULTITHREAD_SUPPORT=ON\ + -DCMAKE_INSTALL_LIBDIR=lib\ + -DCMAKE_CXX_COMPILER=icpx\ + -DCMAKE_C_COMPILER=icx\ + -DCMAKE_INSTALL_PREFIX=${zstd_install_dir} +cmake --build ${zstd_build_dir} -j ${num_build_procs} +cmake --install ${zstd_build_dir} + +#build Protobuf +protobuf_dir=${build_dir}/protobuf +protobuf_src_dir=${protobuf_dir}/src +protobuf_build_dir=${protobuf_dir}/build +protobuf_install_dir=${install_dir} +if [ ! 
-d "${protobuf_src_dir}" ]; then + git clone -b v3.19.4 --recurse-submodules https://github.com/protocolbuffers/protobuf.git ${protobuf_src_dir} +fi +mkdir -p ${protobuf_build_dir} +cmake -S ${protobuf_src_dir}/cmake -B ${protobuf_build_dir}\ + -Dprotobuf_BUILD_SHARED_LIBS=ON\ + -DCMAKE_CXX_COMPILER=icpx\ + -DCMAKE_C_COMPILER=icx\ + -DCMAKE_INSTALL_PREFIX=${protobuf_install_dir} +cmake --build ${protobuf_build_dir} -j ${num_build_procs} +cmake --install ${protobuf_build_dir} + + +#build MGARD +mgard_x_build_dir=${build_dir}/mgard +mgard_x_install_dir=${install_dir} +mkdir -p ${mgard_x_build_dir} +cmake -S ${mgard_x_src_dir} -B ${mgard_x_build_dir} \ + -DCMAKE_PREFIX_PATH="${zstd_install_dir}/lib/cmake/zstd;${protobuf_install_dir}"\ + -DMGARD_ENABLE_SERIAL=OFF\ + -DMGARD_ENABLE_SYCL=ON\ + -DCMAKE_CXX_COMPILER=icpx\ + -DCMAKE_C_COMPILER=icx\ + -DCMAKE_CXX_FLAGS="-O2 -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend \"-device xehp\""\ + -DMGARD_ENABLE_DOCS=OFF\ + -DCMAKE_BUILD_TYPE=Release\ + -DCMAKE_INSTALL_PREFIX=${mgard_x_install_dir} +cmake --build ${mgard_x_build_dir} -j ${num_build_procs} +cmake --install ${mgard_x_build_dir} diff --git a/cmake/MgardXGenerateSource.cmake b/cmake/MgardXGenerateSource.cmake new file mode 100644 index 0000000000..941d152102 --- /dev/null +++ b/cmake/MgardXGenerateSource.cmake @@ -0,0 +1,76 @@ +if (MGARD_ENABLE_SERIAL) + list(APPEND DEVICE_TYPE_LIST SERIAL) +endif() +if (MGARD_ENABLE_CUDA) + list(APPEND DEVICE_TYPE_LIST CUDA) +endif() +if (MGARD_ENABLE_HIP) + list(APPEND DEVICE_TYPE_LIST HIP) +endif() +if (MGARD_ENABLE_SYCL) + list(APPEND DEVICE_TYPE_LIST SYCL) +endif() + +set(DATA_TYPE_LIST double float) +set(NUM_DIM_LIST 1 2 3 4 5) + +function(MgardXGenerateSourceAllCombinations src_file_prefix) + foreach(DEVICE_TYPE IN LISTS DEVICE_TYPE_LIST) + foreach(DATA_TYPE IN LISTS DATA_TYPE_LIST) + foreach(NUM_DIM IN LISTS NUM_DIM_LIST) + set(SRC_FILE "${CMAKE_CURRENT_SOURCE_DIR}/${src_file_prefix}.cpp.in") + set(GEN_FILE "${CMAKE_CURRENT_BINARY_DIR}/${src_file_prefix}_${NUM_DIM}D_${DATA_TYPE}_${DEVICE_TYPE}.cpp") + configure_file(${SRC_FILE} ${GEN_FILE}) + if (${DEVICE_TYPE} STREQUAL "SERIAL") + set_source_files_properties(${GEN_FILE} PROPERTIES LANGUAGE CXX) + list(APPEND MGARD_X_SERIAL_SRC ${GEN_FILE}) + endif() + if (${DEVICE_TYPE} STREQUAL "CUDA") + set_source_files_properties(${GEN_FILE} PROPERTIES LANGUAGE CUDA) + list(APPEND MGARD_X_CUDA_SRC ${GEN_FILE}) + endif() + if (${DEVICE_TYPE} STREQUAL "HIP") + set_source_files_properties(${GEN_FILE} PROPERTIES LANGUAGE HIP) + list(APPEND MGARD_X_HIP_SRC ${GEN_FILE}) + endif() + if (${DEVICE_TYPE} STREQUAL "SYCL") + set_source_files_properties(${GEN_FILE} PROPERTIES LANGUAGE CXX) + list(APPEND MGARD_X_SYCL_SRC ${GEN_FILE}) + endif() + + endforeach() + endforeach() + endforeach() + set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) + set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) + set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) + set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) +endfunction() + +function(MgardXGenerateSourceAllDevices src_file_prefix) + foreach(DEVICE_TYPE IN LISTS DEVICE_TYPE_LIST) + set(SRC_FILE "${CMAKE_CURRENT_SOURCE_DIR}/${src_file_prefix}.cpp.in") + set(GEN_FILE "${CMAKE_CURRENT_BINARY_DIR}/${src_file_prefix}_${DEVICE_TYPE}.cpp") + configure_file(${SRC_FILE} ${GEN_FILE}) + if (${DEVICE_TYPE} STREQUAL "SERIAL") + set_source_files_properties(${GEN_FILE} PROPERTIES LANGUAGE CXX) + list(APPEND MGARD_X_SERIAL_SRC ${GEN_FILE}) + endif() + if (${DEVICE_TYPE} STREQUAL 
"CUDA") + set_source_files_properties(${GEN_FILE} PROPERTIES LANGUAGE CUDA) + list(APPEND MGARD_X_CUDA_SRC ${GEN_FILE}) + endif() + if (${DEVICE_TYPE} STREQUAL "HIP") + set_source_files_properties(${GEN_FILE} PROPERTIES LANGUAGE HIP) + list(APPEND MGARD_X_HIP_SRC ${GEN_FILE}) + endif() + if (${DEVICE_TYPE} STREQUAL "SYCL") + set_source_files_properties(${GEN_FILE} PROPERTIES LANGUAGE CXX) + list(APPEND MGARD_X_SYCL_SRC ${GEN_FILE}) + endif() + endforeach() + set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) + set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) + set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) + set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) +endfunction() \ No newline at end of file diff --git a/examples/mgard-x/BatchTests/BatchTests.cpp b/examples/mgard-x/BatchTests/BatchTests.cpp index 705c051d80..8dfd7542fa 100644 --- a/examples/mgard-x/BatchTests/BatchTests.cpp +++ b/examples/mgard-x/BatchTests/BatchTests.cpp @@ -146,7 +146,7 @@ void compression(std::vector shape, enum device dev, T tol, T s, enum mgard_x::device_type dev_type; if (dev == X_Serial) { - dev_type = mgard_x::device_type::Serial; + dev_type = mgard_x::device_type::SERIAL; } else if (dev == X_CUDA) { dev_type = mgard_x::device_type::CUDA; } else if (dev == X_HIP) { @@ -211,7 +211,7 @@ void decompression(std::vector shape, enum device dev, T tol, enum mgard_x::device_type dev_type; if (dev == X_Serial) { - dev_type = mgard_x::device_type::Serial; + dev_type = mgard_x::device_type::SERIAL; } else if (dev == X_CUDA) { dev_type = mgard_x::device_type::CUDA; } else if (dev == X_HIP) { @@ -349,13 +349,13 @@ int main(int argc, char *argv[]) { dev2 = argv[i++]; enum device device_type1, device_type2; - enum mgard_x::device_type dev_type = mgard_x::device_type::None; + enum mgard_x::device_type dev_type = mgard_x::device_type::NONE; std::cout << "Device1: "; if (strcmp(dev1, "x-serial") == 0) { - dev_type = mgard_x::device_type::Serial; + dev_type = mgard_x::device_type::SERIAL; device_type1 = device::X_Serial; - std::cout << "MGARD-X::Serial\n"; + std::cout << "MGARD-X::SERIAL\n"; } else if (strcmp(dev1, "x-cuda") == 0) { dev_type = mgard_x::device_type::CUDA; device_type1 = device::X_CUDA; @@ -378,9 +378,9 @@ int main(int argc, char *argv[]) { std::cout << "Device2: "; if (strcmp(dev2, "x-serial") == 0) { - dev_type = mgard_x::device_type::Serial; + dev_type = mgard_x::device_type::SERIAL; device_type2 = device::X_Serial; - std::cout << "MGARD-X::Serial\n"; + std::cout << "MGARD-X::SERIAL\n"; } else if (strcmp(dev2, "x-cuda") == 0) { dev_type = mgard_x::device_type::CUDA; device_type2 = device::X_CUDA; diff --git a/examples/mgard-x/HighLevelAPIs/Example.cpp b/examples/mgard-x/HighLevelAPIs/Example.cpp index 1c608d93b4..94b5aeda3e 100644 --- a/examples/mgard-x/HighLevelAPIs/Example.cpp +++ b/examples/mgard-x/HighLevelAPIs/Example.cpp @@ -29,10 +29,10 @@ int main() { std::cout << "Done\n"; std::cout - << "Decompressing with MGARD-X High level API with Serial backend..."; + << "Decompressing with MGARD-X High level API with SERIAL backend..."; // decompression void *decompressed_array_cpu = NULL; - config.dev_type = mgard_x::device_type::Serial; + config.dev_type = mgard_x::device_type::SERIAL; mgard_x::decompress(compressed_array_cpu, compressed_size, decompressed_array_cpu, config, false); diff --git a/examples/mgard-x/LowLevelAPIs/README.md b/examples/mgard-x/LowLevelAPIs/README.md index 858ecced3c..2b8c445f0f 100644 --- a/examples/mgard-x/LowLevelAPIs/README.md +++ 
diff --git a/examples/mgard-x/BatchTests/BatchTests.cpp b/examples/mgard-x/BatchTests/BatchTests.cpp
index 705c051d80..8dfd7542fa 100644
--- a/examples/mgard-x/BatchTests/BatchTests.cpp
+++ b/examples/mgard-x/BatchTests/BatchTests.cpp
@@ -146,7 +146,7 @@ void compression(std::vector shape, enum device dev, T tol, T s,
   enum mgard_x::device_type dev_type;
   if (dev == X_Serial) {
-    dev_type = mgard_x::device_type::Serial;
+    dev_type = mgard_x::device_type::SERIAL;
   } else if (dev == X_CUDA) {
     dev_type = mgard_x::device_type::CUDA;
   } else if (dev == X_HIP) {
@@ -211,7 +211,7 @@ void decompression(std::vector shape, enum device dev, T tol,
   enum mgard_x::device_type dev_type;
   if (dev == X_Serial) {
-    dev_type = mgard_x::device_type::Serial;
+    dev_type = mgard_x::device_type::SERIAL;
   } else if (dev == X_CUDA) {
     dev_type = mgard_x::device_type::CUDA;
   } else if (dev == X_HIP) {
@@ -349,13 +349,13 @@ int main(int argc, char *argv[]) {
   dev2 = argv[i++];
 
   enum device device_type1, device_type2;
-  enum mgard_x::device_type dev_type = mgard_x::device_type::None;
+  enum mgard_x::device_type dev_type = mgard_x::device_type::NONE;
 
   std::cout << "Device1: ";
   if (strcmp(dev1, "x-serial") == 0) {
-    dev_type = mgard_x::device_type::Serial;
+    dev_type = mgard_x::device_type::SERIAL;
     device_type1 = device::X_Serial;
-    std::cout << "MGARD-X::Serial\n";
+    std::cout << "MGARD-X::SERIAL\n";
   } else if (strcmp(dev1, "x-cuda") == 0) {
     dev_type = mgard_x::device_type::CUDA;
     device_type1 = device::X_CUDA;
@@ -378,9 +378,9 @@ int main(int argc, char *argv[]) {
   std::cout << "Device2: ";
   if (strcmp(dev2, "x-serial") == 0) {
-    dev_type = mgard_x::device_type::Serial;
+    dev_type = mgard_x::device_type::SERIAL;
     device_type2 = device::X_Serial;
-    std::cout << "MGARD-X::Serial\n";
+    std::cout << "MGARD-X::SERIAL\n";
   } else if (strcmp(dev2, "x-cuda") == 0) {
     dev_type = mgard_x::device_type::CUDA;
     device_type2 = device::X_CUDA;
diff --git a/examples/mgard-x/HighLevelAPIs/Example.cpp b/examples/mgard-x/HighLevelAPIs/Example.cpp
index 1c608d93b4..94b5aeda3e 100644
--- a/examples/mgard-x/HighLevelAPIs/Example.cpp
+++ b/examples/mgard-x/HighLevelAPIs/Example.cpp
@@ -29,10 +29,10 @@ int main() {
   std::cout << "Done\n";
 
   std::cout
-      << "Decompressing with MGARD-X High level API with Serial backend...";
+      << "Decompressing with MGARD-X High level API with SERIAL backend...";
   // decompression
   void *decompressed_array_cpu = NULL;
-  config.dev_type = mgard_x::device_type::Serial;
+  config.dev_type = mgard_x::device_type::SERIAL;
   mgard_x::decompress(compressed_array_cpu, compressed_size,
                       decompressed_array_cpu, config, false);
diff --git a/examples/mgard-x/LowLevelAPIs/README.md b/examples/mgard-x/LowLevelAPIs/README.md
index 858ecced3c..2b8c445f0f 100644
--- a/examples/mgard-x/LowLevelAPIs/README.md
+++ b/examples/mgard-x/LowLevelAPIs/README.md
@@ -1,7 +1,7 @@
 # Compressing with MGARD-X Low-level APIs
 
 First, build and install MGARD-X.
-Then, run the following in `examples/mgard-x/LowLevelAPIs/Serial`, `examples/mgard-x/LowLevelAPIs/CUDA`, `examples/mgard-x/LowLevelAPIs/HIP`. Each folder contains a CMake project dedicated for a different kind of processor.
+Then, run the following in `examples/mgard-x/LowLevelAPIs/SERIAL`, `examples/mgard-x/LowLevelAPIs/CUDA`, `examples/mgard-x/LowLevelAPIs/HIP`. Each folder contains a CMake project dedicated to a different kind of processor.
 
 Build with CMake as follows or use the 'build_scripts.sh'.
 ```console
diff --git a/examples/mgard-x/LowLevelAPIs/Serial/CMakeLists.txt b/examples/mgard-x/LowLevelAPIs/SERIAL/CMakeLists.txt
similarity index 100%
rename from examples/mgard-x/LowLevelAPIs/Serial/CMakeLists.txt
rename to examples/mgard-x/LowLevelAPIs/SERIAL/CMakeLists.txt
diff --git a/examples/mgard-x/LowLevelAPIs/Serial/Example.cpp b/examples/mgard-x/LowLevelAPIs/SERIAL/Example.cpp
similarity index 74%
rename from examples/mgard-x/LowLevelAPIs/Serial/Example.cpp
rename to examples/mgard-x/LowLevelAPIs/SERIAL/Example.cpp
index d4b6fcaff5..1c2f22c015 100644
--- a/examples/mgard-x/LowLevelAPIs/Serial/Example.cpp
+++ b/examples/mgard-x/LowLevelAPIs/SERIAL/Example.cpp
@@ -13,16 +13,16 @@ int main() {
   double *in_array_cpu = new double[n1 * n2 * n3];
   //... load data into in_array_cpu
   std::vector<mgard_x::SIZE> shape{n1, n2, n3};
-  mgard_x::Hierarchy<3, double, mgard_x::Serial> hierarchy(shape);
-  mgard_x::Array<3, double, mgard_x::Serial> in_array(shape);
+  mgard_x::Hierarchy<3, double, mgard_x::SERIAL> hierarchy(shape);
+  mgard_x::Array<3, double, mgard_x::SERIAL> in_array(shape);
   in_array.load(in_array_cpu);
   std::cout << "Done\n";
 
-  std::cout << "Compressing with MGARD-X Serial backend...";
+  std::cout << "Compressing with MGARD-X SERIAL backend...";
   double tol = 0.01, s = 0, norm;
   mgard_x::Config config;
   config.lossless = mgard_x::lossless_type::Huffman_Zstd;
-  mgard_x::Array<1, unsigned char, mgard_x::Serial> compressed_array =
+  mgard_x::Array<1, unsigned char, mgard_x::SERIAL> compressed_array =
       mgard_x::compress(hierarchy, in_array, mgard_x::error_bound_type::REL,
                         tol, s, norm, config);
   // Get compressed size in number of bytes.
@@ -30,9 +30,9 @@ int main() {
   unsigned char *compressed_array_cpu = compressed_array.hostCopy();
   std::cout << "Done\n";
 
-  std::cout << "Decompressing with MGARD-X Serial backend...";
+  std::cout << "Decompressing with MGARD-X SERIAL backend...";
   // decompression
-  mgard_x::Array<3, double, mgard_x::Serial> decompressed_array =
+  mgard_x::Array<3, double, mgard_x::SERIAL> decompressed_array =
       mgard_x::decompress(hierarchy, compressed_array,
                           mgard_x::error_bound_type::REL, tol, s, norm, config);
   delete[] in_array_cpu;
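
Review note: the SERIAL example above should carry over to the SYCL backend this patch introduces by swapping the `DeviceType` template argument. A minimal sketch, assuming `mgard_x::SYCL` is exposed the same way as `mgard_x::SERIAL` (the `processor_type::X_SYCL` handling later in this patch suggests it is):

```cpp
// Sketch only: assumes mgard_x::SYCL mirrors the mgard_x::SERIAL DeviceType
// used in the example above; everything else follows the same low-level flow.
#include "compress_x_lowlevel.hpp"
#include <vector>

int main() {
  mgard_x::SIZE n1 = 10, n2 = 20, n3 = 30;
  double *in_array_cpu = new double[n1 * n2 * n3];
  //... load data into in_array_cpu
  std::vector<mgard_x::SIZE> shape{n1, n2, n3};
  mgard_x::Hierarchy<3, double, mgard_x::SYCL> hierarchy(shape);
  mgard_x::Array<3, double, mgard_x::SYCL> in_array(shape);
  in_array.load(in_array_cpu);

  double tol = 0.01, s = 0, norm;
  mgard_x::Config config;
  config.lossless = mgard_x::lossless_type::Huffman_Zstd;
  mgard_x::Array<1, unsigned char, mgard_x::SYCL> compressed_array =
      mgard_x::compress(hierarchy, in_array, mgard_x::error_bound_type::REL,
                        tol, s, norm, config);
  mgard_x::Array<3, double, mgard_x::SYCL> decompressed_array =
      mgard_x::decompress(hierarchy, compressed_array,
                          mgard_x::error_bound_type::REL, tol, s, norm, config);
  delete[] in_array_cpu;
  return 0;
}
```
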
diff --git a/examples/mgard-x/LowLevelAPIs/Serial/build_script.sh b/examples/mgard-x/LowLevelAPIs/SERIAL/build_script.sh
similarity index 100%
rename from examples/mgard-x/LowLevelAPIs/Serial/build_script.sh
rename to examples/mgard-x/LowLevelAPIs/SERIAL/build_script.sh
diff --git a/examples/mgard-x/MDR-X/README.md b/examples/mgard-x/MDR-X/README.md
index 0e9c917f60..c8143f98ff 100644
--- a/examples/mgard-x/MDR-X/README.md
+++ b/examples/mgard-x/MDR-X/README.md
@@ -1,7 +1,7 @@
 # Refactor and progressively reconstruct data with MDR-X
 
 First, build and install MGARD-X.
-Then, run the following in `examples/mgard-x/MDR-X/Serial`, `examples/mgard-x/MDR-X/CUDA`, `examples/mgard-x/MDR-X/HIP`. Each folder contains a CMake project dedicated for a different kind of processor.
+Then, run the following in `examples/mgard-x/MDR-X/SERIAL`, `examples/mgard-x/MDR-X/CUDA`, `examples/mgard-x/MDR-X/HIP`. Each folder contains a CMake project dedicated to a different kind of processor.
 
 Build with CMake as follows or use the 'build_scripts.sh'.
 ```console
diff --git a/examples/mgard-x/MDR-X/Serial/CMakeLists.txt b/examples/mgard-x/MDR-X/SERIAL/CMakeLists.txt
similarity index 100%
rename from examples/mgard-x/MDR-X/Serial/CMakeLists.txt
rename to examples/mgard-x/MDR-X/SERIAL/CMakeLists.txt
diff --git a/examples/mgard-x/MDR-X/Serial/build_script.sh b/examples/mgard-x/MDR-X/SERIAL/build_script.sh
similarity index 100%
rename from examples/mgard-x/MDR-X/Serial/build_script.sh
rename to examples/mgard-x/MDR-X/SERIAL/build_script.sh
diff --git a/examples/mgard-x/MDR-X/Serial/reconstructor.cpp b/examples/mgard-x/MDR-X/SERIAL/reconstructor.cpp
similarity index 99%
rename from examples/mgard-x/MDR-X/Serial/reconstructor.cpp
rename to examples/mgard-x/MDR-X/SERIAL/reconstructor.cpp
index fcee4c992d..d555505709 100644
--- a/examples/mgard-x/MDR-X/Serial/reconstructor.cpp
+++ b/examples/mgard-x/MDR-X/SERIAL/reconstructor.cpp
@@ -172,7 +172,7 @@ int main(int argc, char **argv) {
   using T = float;
   using T_stream = uint32_t;
   using T_error = double;
-  using DeviceType = mgard_x::Serial;
+  using DeviceType = mgard_x::SERIAL;
   const mgard_x::DIM D = 3;
 
   mgard_x::Hierarchy<D, T, DeviceType> hierarchy(dims, 0, num_levels - 1);
diff --git a/examples/mgard-x/MDR-X/Serial/refactor.cpp b/examples/mgard-x/MDR-X/SERIAL/refactor.cpp
similarity index 99%
rename from examples/mgard-x/MDR-X/Serial/refactor.cpp
rename to examples/mgard-x/MDR-X/SERIAL/refactor.cpp
index 9516d61607..6b46849a92 100644
--- a/examples/mgard-x/MDR-X/Serial/refactor.cpp
+++ b/examples/mgard-x/MDR-X/SERIAL/refactor.cpp
@@ -79,7 +79,7 @@ int main(int argc, char **argv) {
   using T = float;
   using T_stream = uint32_t;
   using T_error = double;
-  using DeviceType = mgard_x::Serial;
+  using DeviceType = mgard_x::SERIAL;
   if (num_bitplanes > 32) {
     num_bitplanes = 32;
     std::cout << "Only less than 32 bitplanes are supported for "
diff --git a/include/MGARDXConfig.h.in b/include/MGARDXConfig.h.in
index 86b13f586f..a403b002dc 100644
--- a/include/MGARDXConfig.h.in
+++ b/include/MGARDXConfig.h.in
@@ -26,16 +26,16 @@
 #define MGARD_ENABLE_HIP 0
 #endif
 
+#if '@MGARD_ENABLE_SYCL@' == 'ON'
+#define MGARD_ENABLE_SYCL 1
+#else
+#define MGARD_ENABLE_SYCL 0
+#endif
+
 #if '@MGARD_ENABLE_LEGACY_CUDA@' == 'ON'
 #define MGARD_ENABLE_LEGACY_CUDA 1
 #else
 #define MGARD_ENABLE_LEGACY_CUDA 0
 #endif
 
-// #if '@MGARD_ENABLE_SYCL@' == 'ON'
-// #define MGARD_ENABLE_SYCL 1
-// #else
-// #define MGARD_ENABLE_SYCL 0
-// #endif
-
 #endif
diff --git a/include/compress_x.hpp b/include/compress_x.hpp
index 1b4e9fae2d..1d2189cbb6 100644
--- a/include/compress_x.hpp
+++ b/include/compress_x.hpp
@@ -5,9 +5,9 @@
  * Date: March 17, 2022
  */
 
-#include "mgard-x/CompressionWorkflow.h"
+#include "MGARDXConfig.h"
 #include "mgard-x/RuntimeX/RuntimeXPublic.h"
-#include "mgard-x/Types.h"
+#include "mgard-x/Utilities/Types.h"
 #include
 
 #ifndef MGARD_X_API_H
diff --git a/include/compress_x_lowlevel.hpp b/include/compress_x_lowlevel.hpp
index 57ec7cd214..6b9507e3c0 100644
--- a/include/compress_x_lowlevel.hpp
+++ b/include/compress_x_lowlevel.hpp
@@ -5,8 +5,8 @@
  * Date: March 17, 2022
  */
 
-#include "mgard-x/CompressionWorkflow.hpp"
-#include "mgard-x/Hierarchy.hpp"
+#include "mgard-x/CompressionLowLevel/CompressionLowLevel.hpp"
+#include "mgard-x/Hierarchy/Hierarchy.hpp" #include "mgard-x/RuntimeX/DataStructures/Array.hpp" #include "mgard-x/RuntimeX/RuntimeX.h" diff --git a/include/mgard-x/CMakeLists.txt b/include/mgard-x/CMakeLists.txt index 753cc304d3..df47eb4dcc 100644 --- a/include/mgard-x/CMakeLists.txt +++ b/include/mgard-x/CMakeLists.txt @@ -1,16 +1,10 @@ add_subdirectory(DataRefactoring) +add_subdirectory(CompressionLowLevel) +add_subdirectory(CompressionHighLevel) +add_subdirectory(Hierarchy) add_subdirectory(Lossless) add_subdirectory(Quantization) add_subdirectory(RuntimeX) -list(APPEND MGARD_X_HEADER - ${CMAKE_CURRENT_SOURCE_DIR}/CompressionWorkflow.h - ${CMAKE_CURRENT_SOURCE_DIR}/CompressionWorkflow.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/Hierarchy.h - ${CMAKE_CURRENT_SOURCE_DIR}/Hierarchy.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/HighLevelAPI.h - ${CMAKE_CURRENT_SOURCE_DIR}/HighLevelAPI.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/Metadata.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/Types.h - ) + set(MGARD_X_HEADER ${MGARD_X_HEADER} PARENT_SCOPE) diff --git a/include/mgard-x/CompressionHighLevel/CMakeLists.txt b/include/mgard-x/CompressionHighLevel/CMakeLists.txt new file mode 100644 index 0000000000..58c83c1311 --- /dev/null +++ b/include/mgard-x/CompressionHighLevel/CMakeLists.txt @@ -0,0 +1,6 @@ +list(APPEND MGARD_X_HEADER + ${CMAKE_CURRENT_SOURCE_DIR}/CompressionHighLevel.h + ${CMAKE_CURRENT_SOURCE_DIR}/CompressionHighLevel.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/Metadata.hpp + ) +set(MGARD_X_HEADER ${MGARD_X_HEADER} PARENT_SCOPE) \ No newline at end of file diff --git a/include/mgard-x/HighLevelAPI.h b/include/mgard-x/CompressionHighLevel/CompressionHighLevel.h similarity index 88% rename from include/mgard-x/HighLevelAPI.h rename to include/mgard-x/CompressionHighLevel/CompressionHighLevel.h index 1082e76c4f..c56c283900 100644 --- a/include/mgard-x/HighLevelAPI.h +++ b/include/mgard-x/CompressionHighLevel/CompressionHighLevel.h @@ -11,13 +11,12 @@ #include #include -// #include "compress_cuda.hpp" -#include "mgard-x/Hierarchy.h" -#include "mgard-x/Metadata.hpp" -#include "mgard-x/RuntimeX/RuntimeXPublic.h" +#include "../Hierarchy/Hierarchy.h" +#include "../RuntimeX/RuntimeXPublic.h" +#include "Metadata.hpp" -#ifndef MGARD_X_HIGH_LEVEL_API_H -#define MGARD_X_HIGH_LEVEL_API_H +#ifndef MGARD_X_COMPRESSION_HIGH_LEVEL_API_H +#define MGARD_X_COMPRESSION_HIGH_LEVEL_API_H namespace mgard_x { @@ -66,10 +65,6 @@ void decompress(const void *compressed_data, size_t compressed_size, void *&decompressed_data, data_type &dtype, std::vector &shape, bool output_pre_allocated); -template void BeginAutoTuning(); - -template void EndAutoTuning(); - } // namespace mgard_x #endif \ No newline at end of file diff --git a/include/mgard-x/HighLevelAPI.hpp b/include/mgard-x/CompressionHighLevel/CompressionHighLevel.hpp similarity index 99% rename from include/mgard-x/HighLevelAPI.hpp rename to include/mgard-x/CompressionHighLevel/CompressionHighLevel.hpp index 4369b04773..d88d2e3d8d 100644 --- a/include/mgard-x/HighLevelAPI.hpp +++ b/include/mgard-x/CompressionHighLevel/CompressionHighLevel.hpp @@ -11,15 +11,15 @@ #include #include +#include "../Hierarchy/Hierarchy.hpp" +#include "../RuntimeX/RuntimeX.h" +#include "Metadata.hpp" #include "compress_x.hpp" -#include "mgard-x/Hierarchy.hpp" -#include "mgard-x/Metadata.hpp" -#include "mgard-x/RuntimeX/RuntimeX.h" -#include "Utilities/CheckEndianess.h" +#include "../CompressionLowLevel/CompressionLowLevel.h" -#ifndef MGARD_X_HIGH_LEVEL_API_HPP -#define MGARD_X_HIGH_LEVEL_API_HPP +#ifndef 
MGARD_X_COMPRESSION_HIGH_LEVEL_API_HPP +#define MGARD_X_COMPRESSION_HIGH_LEVEL_API_HPP namespace mgard_x { @@ -292,12 +292,14 @@ void compress(std::vector shape, T tol, T s, enum error_bound_type type, Hierarchy hierarchy(shape, config.uniform_coord_mode); Metadata m; - if (std::is_same::value) { - m.ptype = processor_type::X_Serial; + if (std::is_same::value) { + m.ptype = processor_type::X_SERIAL; } else if (std::is_same::value) { m.ptype = processor_type::X_CUDA; } else if (std::is_same::value) { m.ptype = processor_type::X_HIP; + } else if (std::is_same::value) { + m.ptype = processor_type::X_SYCL; } m.ebtype = type; m.tol = tol; @@ -325,7 +327,6 @@ void compress(std::vector shape, T tol, T s, enum error_bound_type type, #endif m.dtype = std::is_same::value ? data_type::Double : data_type::Float; - m.etype = CheckEndianess(); m.dstype = data_structure_type::Cartesian_Grid_Uniform; m.total_dims = D; m.shape = std::vector(D); @@ -521,12 +522,14 @@ void compress(std::vector shape, T tol, T s, enum error_bound_type type, Hierarchy hierarchy(shape, coords); Metadata m; - if (std::is_same::value) { - m.ptype = processor_type::X_Serial; + if (std::is_same::value) { + m.ptype = processor_type::X_SERIAL; } else if (std::is_same::value) { m.ptype = processor_type::X_CUDA; } else if (std::is_same::value) { m.ptype = processor_type::X_HIP; + } else if (std::is_same::value) { + m.ptype = processor_type::X_SYCL; } m.ebtype = type; m.tol = tol; @@ -554,7 +557,6 @@ void compress(std::vector shape, T tol, T s, enum error_bound_type type, #endif m.dtype = std::is_same::value ? data_type::Double : data_type::Float; - m.etype = CheckEndianess(); m.dstype = data_structure_type::Cartesian_Grid_Non_Uniform; m.total_dims = D; m.shape = std::vector(D); @@ -1604,14 +1606,6 @@ void decompress(const void *compressed_data, size_t compressed_size, dtype, shape, config, output_pre_allocated); } -template void BeginAutoTuning() { - AutoTuner::ProfileKernels = true; -} - -template void EndAutoTuning() { - AutoTuner::ProfileKernels = false; -} - } // namespace mgard_x #endif \ No newline at end of file diff --git a/include/mgard-x/Metadata.hpp b/include/mgard-x/CompressionHighLevel/Metadata.hpp similarity index 98% rename from include/mgard-x/Metadata.hpp rename to include/mgard-x/CompressionHighLevel/Metadata.hpp index b7a61c3c1e..a0111629ce 100644 --- a/include/mgard-x/Metadata.hpp +++ b/include/mgard-x/CompressionHighLevel/Metadata.hpp @@ -5,9 +5,9 @@ * Date: March 17, 2022 */ +#include "../RuntimeX/RuntimeX.h" +#include "../Utilities/Types.h" #include "MGARDConfig.hpp" -#include "RuntimeX/RuntimeX.h" -#include "Types.h" #include "format.hpp" #include "proto/mgard.pb.h" #include @@ -160,8 +160,8 @@ template struct Metadata { } std::cout << "Backend: "; - if (ptype == processor_type::X_Serial) { - std::cout << "X_Serial\n"; + if (ptype == processor_type::X_SERIAL) { + std::cout << "X_SERIAL\n"; } else if (ptype == processor_type::X_CUDA) { std::cout << "X_CUDA\n"; } else if (ptype == processor_type::X_HIP) { @@ -173,7 +173,11 @@ template struct Metadata { private: SERIALIZED_TYPE *SerializeAll(uint32_t &total_size) { - + if (big_endian()) { + etype = endiness_type::Big_Endian; + } else { + etype = endiness_type::Little_Endian; + } total_size = 0; // about MGARD software @@ -500,7 +504,7 @@ template struct Metadata { { // Device mgard::pb::Device &device = *header.mutable_device(); - if (ptype == processor_type::X_Serial) { + if (ptype == processor_type::X_SERIAL) { device.set_backend(mgard::pb::Device::X_SERIAL); } 
else if (ptype == processor_type::X_CUDA) { device.set_backend(mgard::pb::Device::X_CUDA); @@ -746,7 +750,7 @@ template struct Metadata { { // Device const mgard::pb::Device device = header.device(); if (device.backend() == mgard::pb::Device::X_SERIAL) { - ptype = processor_type::X_Serial; + ptype = processor_type::X_SERIAL; } else if (device.backend() == mgard::pb::Device::X_CUDA) { ptype = processor_type::X_CUDA; } else if (device.backend() == mgard::pb::Device::X_HIP) { diff --git a/include/mgard-x/CompressionLowLevel/CMakeLists.txt b/include/mgard-x/CompressionLowLevel/CMakeLists.txt new file mode 100644 index 0000000000..7dcac948d7 --- /dev/null +++ b/include/mgard-x/CompressionLowLevel/CMakeLists.txt @@ -0,0 +1,5 @@ +list(APPEND MGARD_X_HEADER + ${CMAKE_CURRENT_SOURCE_DIR}/CompressionLowLevel.h + ${CMAKE_CURRENT_SOURCE_DIR}/CompressionLowLevel.hpp + ) +set(MGARD_X_HEADER ${MGARD_X_HEADER} PARENT_SCOPE) \ No newline at end of file diff --git a/include/mgard-x/CompressionWorkflow.h b/include/mgard-x/CompressionLowLevel/CompressionLowLevel.h similarity index 83% rename from include/mgard-x/CompressionWorkflow.h rename to include/mgard-x/CompressionLowLevel/CompressionLowLevel.h index 0ee4e5e342..a45c89b551 100644 --- a/include/mgard-x/CompressionWorkflow.h +++ b/include/mgard-x/CompressionLowLevel/CompressionLowLevel.h @@ -5,11 +5,11 @@ * Date: March 17, 2022 */ -#ifndef MGARD_X_COMPRESSION_WORKFLOW_H -#define MGARD_X_COMPRESSION_WORKFLOW_H +#ifndef MGARD_X_COMPRESSION_LOW_LEVEL_H +#define MGARD_X_COMPRESSION_LOW_LEVEL_H -#include "Hierarchy.h" -#include "RuntimeX/RuntimeXPublic.h" +#include "../Hierarchy/Hierarchy.hpp" +#include "../RuntimeX/RuntimeXPublic.h" namespace mgard_x { diff --git a/include/mgard-x/CompressionWorkflow.hpp b/include/mgard-x/CompressionLowLevel/CompressionLowLevel.hpp similarity index 91% rename from include/mgard-x/CompressionWorkflow.hpp rename to include/mgard-x/CompressionLowLevel/CompressionLowLevel.hpp index edc3ffa8a0..47062970f7 100644 --- a/include/mgard-x/CompressionWorkflow.hpp +++ b/include/mgard-x/CompressionLowLevel/CompressionLowLevel.hpp @@ -11,38 +11,37 @@ #include #include -#include "Types.h" +#include "../Utilities/Types.h" -#include "CompressionWorkflow.h" -#include "Hierarchy.hpp" -#include "RuntimeX/RuntimeX.h" +#include "../Hierarchy/Hierarchy.hpp" +#include "../RuntimeX/RuntimeX.h" +#include "CompressionLowLevel.h" -#include "DataRefactoring/MultiDimension/DataRefactoring.h" -#include "DataRefactoring/SingleDimension/DataRefactoring.h" +#include "../DataRefactoring/MultiDimension/DataRefactoring.h" +#include "../DataRefactoring/SingleDimension/DataRefactoring.h" -#include "Quantization/LinearQuantization.hpp" +#include "../Quantization/LinearQuantization.hpp" // #include "Linearization/LevelLinearizer.hpp" -#include "Linearization/LevelLinearizer2.hpp" +#include "../Linearization/LevelLinearizer2.hpp" -#include "Lossless/ParallelHuffman/Huffman.hpp" +#include "../Lossless/ParallelHuffman/Huffman.hpp" #ifdef MGARDX_COMPILE_CUDA -#include "Lossless/Cascaded.hpp" -#include "Lossless/LZ4.hpp" +#include "../Lossless/Cascaded.hpp" +#include "../Lossless/LZ4.hpp" #endif -#include "Lossless/CPU.hpp" -#include "Lossless/Zstd.hpp" -#include "Utilities/CheckEndianess.h" +#include "../Lossless/CPU.hpp" +#include "../Lossless/Zstd.hpp" // for debugging // #include "../cuda/CommonInternal.h" // #include "../cuda/DataRefactoring.h" // #include "../cuda/SubArray.h" -#ifndef MGARD_X_COMPRESSION_WORKFLOW_HPP -#define MGARD_X_COMPRESSION_WORKFLOW_HPP 
+#ifndef MGARD_X_COMPRESSION_LOW_LEVEL_HPP +#define MGARD_X_COMPRESSION_LOW_LEVEL_HPP #define BLOCK_SIZE 64 @@ -58,6 +57,11 @@ compress(Hierarchy &hierarchy, Array &in_array, enum error_bound_type type, T tol, T s, T &norm, Config config) { DeviceRuntime::SelectDevice(config.dev_id); + if (config.timing) { + std::cout << log::log_info + << "Select device: " << DeviceRuntime::GetDeviceName() + << "\n"; + } Timer timer_total, timer_each; for (DIM i = 0; i < D; i++) { if (hierarchy.shape[i] != in_array.shape()[i]) { @@ -193,6 +197,8 @@ compress(Hierarchy &hierarchy, SubArray<1, LENGTH, DeviceType> outlier_idx_subarray(outlier_idx_array); SubArray<1, QUANTIZED_INT, DeviceType> outliers_subarray(outliers_array); + DeviceRuntime::SyncQueue(0); + LevelwiseLinearQuantizeND().Execute( SubArray<1, SIZE, DeviceType>(hierarchy.ranges), hierarchy.l_target, quantizers_subarray, SubArray<2, T, DeviceType>(hierarchy.volumes_array), @@ -201,11 +207,12 @@ compress(Hierarchy &hierarchy, SubArray<1, SIZE, DeviceType>(hierarchy.shapes[0], true), SubArray<1, LENGTH, DeviceType>(outlier_count_array), outlier_idx_subarray, outliers_subarray, 0); + MemoryManager::Copy1D(&outlier_count, outlier_count_array.data(), 1, 0); - DeviceRuntime::SyncDevice(); - // m.huff_outlier_count = outlier_count; + if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_each.end(); timer_each.print("Quantization"); timer_each.clear(); @@ -213,15 +220,12 @@ compress(Hierarchy &hierarchy, << total_elems << " (" << (double)100 * outlier_count / total_elems << "%)\n"; } - if (debug_print) { - // PrintSubarray("decomposed", SubArray(in_array)); - // PrintSubarray("signed_quanzited_array", SubArray(signed_quanzited_array)); std::cout << "outlier_count: " << - // outlier_count << std::endl; PrintSubarray("quantized outliers_array", - // SubArray<1, QUANTIZED_INT, DeviceType>(outliers_array)); - // PrintSubarray("quantized outlier_idx_array", SubArray<1, LENGTH, - // DeviceType>(outlier_idx_array)); - } + // if (debug_print) { + // PrintSubarray("decomposed", SubArray(in_array)); + // PrintSubarray("quantized_subarray", quantized_subarray); + // PrintSubarray("quantized outliers_array", outliers_subarray); + // PrintSubarray("quantized outlier_idx_array", outlier_idx_subarray); + // } Array<1, Byte, DeviceType> lossless_compressed_array; SubArray<1, Byte, DeviceType> lossless_compressed_subarray; @@ -247,6 +251,7 @@ compress(Hierarchy &hierarchy, } DeviceRuntime::SyncDevice(); if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_each.end(); timer_each.print("Level Linearizer type: " + std::to_string(config.reorder)); @@ -278,6 +283,7 @@ compress(Hierarchy &hierarchy, outliers_subarray); lossless_compressed_subarray = SubArray(lossless_compressed_array); if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_each.end(); timer_each.print("Huffman Compress"); std::cout << log::log_info @@ -301,6 +307,7 @@ compress(Hierarchy &hierarchy, CPUCompress(quantized_linearized_subarray); lossless_compressed_subarray = SubArray(lossless_compressed_array); if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_each.end(); timer_each.print("CPU Lossless"); std::cout << log::log_info << "CPU Lossless compress ratio: " @@ -327,6 +334,7 @@ compress(Hierarchy &hierarchy, lossless_compressed_subarray = SubArray(lossless_compressed_array); SIZE lz4_after_size = lossless_compressed_subarray.getShape(0); if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_each.end(); timer_each.print("LZ4 Compress"); std::cout << log::log_info << "LZ4 block size: 
" << config.lz4_block_size @@ -359,6 +367,7 @@ compress(Hierarchy &hierarchy, } if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_total.end(); timer_total.print("Overall Compress"); std::cout << log::log_time << "Compression Throughput: " @@ -376,6 +385,11 @@ decompress(Hierarchy &hierarchy, Array<1, unsigned char, DeviceType> &compressed_array, enum error_bound_type type, T tol, T s, T norm, Config config) { DeviceRuntime::SelectDevice(config.dev_id); + if (config.timing) { + std::cout << log::log_info + << "Select device: " << DeviceRuntime::GetDeviceName() + << "\n"; + } Timer timer_total, timer_each; SIZE total_elems = @@ -404,8 +418,8 @@ decompress(Hierarchy &hierarchy, lossless_compressed_array = LZ4Decompress(lossless_compressed_subarray); lossless_compressed_subarray = SubArray(lossless_compressed_array); - DeviceRuntime::SyncDevice(); if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_each.end(); timer_each.print("LZ4 Decompress"); timer_each.clear(); @@ -424,8 +438,8 @@ decompress(Hierarchy &hierarchy, lossless_compressed_array = ZstdDecompress(lossless_compressed_subarray); lossless_compressed_subarray = SubArray(lossless_compressed_array); - DeviceRuntime::SyncDevice(); if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_each.end(); timer_each.print("Zstd Decompress"); timer_each.clear(); @@ -470,8 +484,8 @@ decompress(Hierarchy &hierarchy, } else { std::cout << log::log_err << "wrong reodering option.\n"; } - DeviceRuntime::SyncDevice(); if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_each.end(); timer_each.print("Level Linearizer type: " + std::to_string(config.reorder)); @@ -484,8 +498,8 @@ decompress(Hierarchy &hierarchy, total_elems, 0); } - DeviceRuntime::SyncDevice(); if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_each.end(); timer_each.print("Huffman Decompress"); timer_each.clear(); @@ -509,8 +523,8 @@ decompress(Hierarchy &hierarchy, } else { std::cout << log::log_err << "wrong reodering type.\n"; } - DeviceRuntime::SyncDevice(); if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_each.end(); timer_each.print("Level Linearizer type: " + std::to_string(config.reorder)); @@ -524,6 +538,7 @@ decompress(Hierarchy &hierarchy, (QUANTIZED_INT *)quantized_linearized_array.data(), total_elems, 0); } if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_each.end(); timer_each.print("CPU Lossless"); timer_each.clear(); @@ -565,10 +580,8 @@ decompress(Hierarchy &hierarchy, SubArray<1, SIZE, DeviceType>(hierarchy.shapes[0], true), outlier_count, outlier_idx_subarray, outliers_subarray, 0); - DeviceRuntime::SyncDevice(); - - // hierarchy.sync_all(); if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_each.end(); timer_each.print("Dequantization"); timer_each.clear(); @@ -588,15 +601,16 @@ decompress(Hierarchy &hierarchy, recompose_single(hierarchy, decompressed_subarray, hierarchy.l_target, 0); } - // hierarchy.sync_all(); + if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_each.end(); timer_each.print("Recomposition"); timer_each.clear(); } - // hierarchy.sync_all(); if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_total.end(); timer_total.print("Overall Decompression"); std::cout << log::log_time << "Decompression Throughput: " diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CMakeLists.txt b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CMakeLists.txt index 5b73c54270..ba5d9ebe59 100644 --- a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CMakeLists.txt +++ 
b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CMakeLists.txt @@ -1,8 +1,6 @@ list(APPEND MGARD_X_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/GPKFunctor.h ${CMAKE_CURRENT_SOURCE_DIR}/GridProcessingKernel.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/GridProcessingKernel.h ${CMAKE_CURRENT_SOURCE_DIR}/GridProcessingKernel3D.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/GridProcessingKernel3D.h ) set(MGARD_X_HEADER ${MGARD_X_HEADER} PARENT_SCOPE) \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficients3D.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficients3D.hpp new file mode 100644 index 0000000000..55105682eb --- /dev/null +++ b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficients3D.hpp @@ -0,0 +1,93 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "../../../Hierarchy/Hierarchy.hpp" +#include "../../../RuntimeX/RuntimeX.h" + +#include "../DataRefactoring.h" + +#include "GridProcessingKernel3D.hpp" + +#ifndef MGARD_X_DATA_REFACTORING_CALC_COEFFICIENTS_3D +#define MGARD_X_DATA_REFACTORING_CALC_COEFFICIENTS_3D + +namespace mgard_x { + +template +void CalcCoefficients3D(Hierarchy &hierarchy, + SubArray dinput, + SubArray &doutput, SIZE l, + int queue_idx) { + + int range_l = std::min(6, (int)std::log2(hierarchy.dofs[0][l]) - 1); + int range_lp1 = std::min(6, (int)std::log2(hierarchy.dofs[0][l + 1]) - 1); + + std::string prefix = "decomp_"; + if (sizeof(T) == sizeof(double)) + prefix += "d_"; + if (sizeof(T) == sizeof(float)) + prefix += "f_"; + for (int d = 0; d < D; d++) + prefix += std::to_string(hierarchy.shape[d]) + "_"; + + dinput.project(0, 1, 2); + doutput.project(0, 1, 2); + + SIZE f = hierarchy.dofs[0][l]; + SIZE c = hierarchy.dofs[1][l]; + SIZE r = hierarchy.dofs[2][l]; + SIZE ff = hierarchy.dofs[0][l + 1]; + SIZE cc = hierarchy.dofs[1][l + 1]; + SIZE rr = hierarchy.dofs[2][l + 1]; + + SubArray dcoarse = doutput; + dcoarse.resize({ff, cc, rr}); + SubArray dcoeff_f = doutput; + dcoeff_f.offset({ff, 0, 0}); + dcoeff_f.resize({f - ff, cc, rr}); + SubArray dcoeff_c = doutput; + dcoeff_c.offset({0, cc, 0}); + dcoeff_c.resize({ff, c - cc, rr}); + SubArray dcoeff_r = doutput; + dcoeff_r.offset({0, 0, rr}); + dcoeff_r.resize({ff, cc, r - rr}); + SubArray dcoeff_cf = doutput; + dcoeff_cf.offset({ff, cc, 0}); + dcoeff_cf.resize({f - ff, c - cc, rr}); + SubArray dcoeff_rf = doutput; + dcoeff_rf.offset({ff, 0, rr}); + dcoeff_rf.resize({f - ff, cc, r - rr}); + SubArray dcoeff_rc = doutput; + dcoeff_rc.offset({0, cc, rr}); + dcoeff_rc.resize({ff, c - cc, r - rr}); + SubArray dcoeff_rcf = doutput; + dcoeff_rcf.offset({ff, cc, rr}); + dcoeff_rcf.resize({f - ff, c - cc, r - rr}); + + GpkReo3D().Execute( + hierarchy.dofs[2][l], hierarchy.dofs[1][l], hierarchy.dofs[0][l], + hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[0][l + 1], SubArray(hierarchy.ratio_array[2][l]), + SubArray(hierarchy.ratio_array[1][l]), + SubArray(hierarchy.ratio_array[0][l]), dinput, dcoarse, dcoeff_f, + dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, dcoeff_rcf, + queue_idx); + + verify_matrix_cuda(hierarchy.dofs[2][l], hierarchy.dofs[1][l], + hierarchy.dofs[0][l], doutput.data(), doutput.getLd(0), + doutput.getLd(1), doutput.getLd(0), + prefix + "gpk_reo_3d" + "_level_" + std::to_string(l), + multidim_refactoring_store, 
multidim_refactoring_verify); + + if (multidim_refactoring_debug_print) { + PrintSubarray("after pi_Ql_reo", doutput); + } +} + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficientsND.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficientsND.hpp new file mode 100644 index 0000000000..6363c83670 --- /dev/null +++ b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficientsND.hpp @@ -0,0 +1,222 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "../../../Hierarchy/Hierarchy.hpp" +#include "../../../RuntimeX/RuntimeX.h" + +#include "../DataRefactoring.h" + +#include "CalcCoefficientsPointers.hpp" +#include "GridProcessingKernel.hpp" + +#ifndef MGARD_X_DATA_REFACTORING_CALC_COEFFICIENTS_ND +#define MGARD_X_DATA_REFACTORING_CALC_COEFFICIENTS_ND + +namespace mgard_x { + +template +void CalcCoefficientsND(Hierarchy &hierarchy, + SubArray dinput1, + SubArray dinput2, + SubArray &doutput, SIZE l, + int queue_idx) { + + std::string prefix = "decomp_"; + if (sizeof(T) == sizeof(double)) + prefix += "d_"; + if (sizeof(T) == sizeof(float)) + prefix += "f_"; + for (int d = 0; d < D; d++) + prefix += std::to_string(hierarchy.shape[d]) + "_"; + // printf("interpolate 1-3D\n"); + + SubArray dcoarse, dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, + dcoeff_rf, dcoeff_rc, dcoeff_rcf; + + DIM curr_dims[3]; + + int unprocessed_idx = 0; + curr_dims[0] = 0; + curr_dims[1] = 1; + curr_dims[2] = 2; + dinput1.project(curr_dims[0], curr_dims[1], curr_dims[2]); + doutput.project(curr_dims[0], curr_dims[1], curr_dims[2]); + + CalcCoefficientsPointers(hierarchy, curr_dims, l, doutput, dcoarse, dcoeff_f, + dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf); + + GpkReo().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], + curr_dims[1], curr_dims[0], + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput1, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, dcoeff_rcf, + queue_idx); + + for (DIM d = 3; d < D; d += 2) { + // copy back to input1 for interpolation again + // LwpkReo().Execute(doutput, dinput1, queue_idx); + CopyND(doutput, dinput1, queue_idx); + + // printf("interpolate %u-%uD\n", d+1, d+2); + curr_dims[0] = 0; + curr_dims[1] = d; + curr_dims[2] = d + 1; + dinput1.project(curr_dims[0], curr_dims[1], curr_dims[2]); + doutput.project(curr_dims[0], curr_dims[1], curr_dims[2]); + CalcCoefficientsPointers(hierarchy, curr_dims, l, doutput, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, + dcoeff_rc, dcoeff_rcf); + + if (D - d == 1) { + unprocessed_idx += 1; + + GpkReo().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], + curr_dims[1], curr_dims[0], + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + 
SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput1, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf, queue_idx); + + } else { // D - d >= 2 + unprocessed_idx += 2; + GpkReo().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), + // unprocessed_dims_subarray, + curr_dims[2], curr_dims[1], curr_dims[0], + // ratio_r, ratio_c, ratio_f, + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput1, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf, queue_idx); + } + } + + if (multidim_refactoring_debug_print) { // debug + PrintSubarray4D("after interpolation", doutput); + } // debug + + unprocessed_idx = 0; + // printf("reorder 1-3D\n"); + curr_dims[0] = 0; + curr_dims[1] = 1; + curr_dims[2] = 2; + dinput2.project(curr_dims[0], curr_dims[1], curr_dims[2]); + dinput1.project(curr_dims[0], curr_dims[1], + curr_dims[2]); // reuse input1 as temp output + + CalcCoefficientsPointers(hierarchy, curr_dims, l, dinput1, dcoarse, dcoeff_f, + dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf); + + GpkReo().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], + curr_dims[1], curr_dims[0], + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput2, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, dcoeff_rcf, + queue_idx); + + DIM D_reduced = D % 2 == 0 ? 
D - 1 : D - 2; + for (DIM d = 3; d < D_reduced; d += 2) { + // copy back to input2 for reordering again + + // LwpkReo().Execute(dinput1, dinput2, queue_idx); + CopyND(dinput1, dinput2, queue_idx); + + unprocessed_idx += 2; + // printf("reorder %u-%uD\n", d+1, d+2); + curr_dims[0] = 0; + curr_dims[1] = d; + curr_dims[2] = d + 1; + dinput2.project(curr_dims[0], curr_dims[1], curr_dims[2]); + dinput1.project(curr_dims[0], curr_dims[1], + curr_dims[2]); // reuse input1 as temp output + + CalcCoefficientsPointers(hierarchy, curr_dims, l, dinput1, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, + dcoeff_rc, dcoeff_rcf); + + GpkReo().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], + curr_dims[1], curr_dims[0], + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput2, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf, queue_idx); + } + + // printf("calc coeff %u-%dD\n", D_reduced+1, D_reduced+2); + curr_dims[0] = 0; + curr_dims[1] = D_reduced; + curr_dims[2] = D_reduced + 1; + dinput1.project(curr_dims[0], curr_dims[1], curr_dims[2]); + doutput.project(curr_dims[0], curr_dims[1], + curr_dims[2]); // reuse input1 as temp output + CalcCoefficientsPointers(hierarchy, curr_dims, l, doutput, dcoarse, dcoeff_f, + dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf); + if (D - D_reduced == 1) { + unprocessed_idx += 1; + GpkReo().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], + curr_dims[1], curr_dims[0], + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput1, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf, queue_idx); + + } else { // D-D_reduced == 2 + unprocessed_idx += 2; + + GpkReo().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], + curr_dims[1], curr_dims[0], + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput1, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf, queue_idx); + } + + if (multidim_refactoring_debug_print) { // debug + PrintSubarray4D("after calc coeff", doutput); + } // debug +} + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficientsPointers.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficientsPointers.hpp new file mode 100644 index 0000000000..e0d1948d5f --- /dev/null +++ b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficientsPointers.hpp @@ -0,0 +1,88 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. 
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "../../../Hierarchy/Hierarchy.hpp" +#include "../../../RuntimeX/RuntimeX.h" + +#ifndef MGARD_X_DATA_REFACTORING_CALC_COEFFICIENT_POINTERS +#define MGARD_X_DATA_REFACTORING_CALC_COEFFICIENT_POINTERS + +namespace mgard_x { + +template +void CalcCoefficientsPointers( + Hierarchy &hierarchy, DIM curr_dims[3], DIM l, + SubArray doutput, SubArray &dcoarse, + SubArray &dcoeff_f, SubArray &dcoeff_c, + SubArray &dcoeff_r, SubArray &dcoeff_cf, + SubArray &dcoeff_rf, + SubArray &dcoeff_rc, + SubArray &dcoeff_rcf) { + + SIZE n[3]; + SIZE nn[3]; + for (DIM d = 0; d < 3; d++) { + n[d] = hierarchy.dofs[curr_dims[d]][l]; + nn[d] = hierarchy.dofs[curr_dims[d]][l + 1]; + } + + dcoarse = doutput; + dcoarse.resize(curr_dims[0], nn[0]); + dcoarse.resize(curr_dims[1], nn[1]); + dcoarse.resize(curr_dims[2], nn[2]); + + dcoeff_f = doutput; + dcoeff_f.offset(curr_dims[0], nn[0]); + dcoeff_f.resize(curr_dims[0], n[0] - nn[0]); + dcoeff_f.resize(curr_dims[1], nn[1]); + dcoeff_f.resize(curr_dims[2], nn[2]); + + dcoeff_c = doutput; + dcoeff_c.offset(curr_dims[1], nn[1]); + dcoeff_c.resize(curr_dims[0], nn[0]); + dcoeff_c.resize(curr_dims[1], n[1] - nn[1]); + dcoeff_c.resize(curr_dims[2], nn[2]); + + dcoeff_r = doutput; + dcoeff_r.offset(curr_dims[2], nn[2]); + dcoeff_r.resize(curr_dims[0], nn[0]); + dcoeff_r.resize(curr_dims[1], nn[1]); + dcoeff_r.resize(curr_dims[2], n[2] - nn[2]); + + dcoeff_cf = doutput; + dcoeff_cf.offset(curr_dims[0], nn[0]); + dcoeff_cf.offset(curr_dims[1], nn[1]); + dcoeff_cf.resize(curr_dims[0], n[0] - nn[0]); + dcoeff_cf.resize(curr_dims[1], n[1] - nn[1]); + dcoeff_cf.resize(curr_dims[2], nn[2]); + + dcoeff_rf = doutput; + dcoeff_rf.offset(curr_dims[0], nn[0]); + dcoeff_rf.offset(curr_dims[2], nn[2]); + dcoeff_rf.resize(curr_dims[0], n[0] - nn[0]); + dcoeff_rf.resize(curr_dims[1], nn[1]); + dcoeff_rf.resize(curr_dims[2], n[2] - nn[2]); + + dcoeff_rc = doutput; + dcoeff_rc.offset(curr_dims[1], nn[1]); + dcoeff_rc.offset(curr_dims[2], nn[2]); + dcoeff_rc.resize(curr_dims[0], nn[0]); + dcoeff_rc.resize(curr_dims[1], n[1] - nn[1]); + dcoeff_rc.resize(curr_dims[2], n[2] - nn[2]); + + dcoeff_rcf = doutput; + dcoeff_rcf.offset(curr_dims[0], nn[0]); + dcoeff_rcf.offset(curr_dims[1], nn[1]); + dcoeff_rcf.offset(curr_dims[2], nn[2]); + dcoeff_rcf.resize(curr_dims[0], n[0] - nn[0]); + dcoeff_rcf.resize(curr_dims[1], n[1] - nn[1]); + dcoeff_rcf.resize(curr_dims[2], n[2] - nn[2]); +} + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestore3D.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestore3D.hpp new file mode 100644 index 0000000000..ec1cbebadf --- /dev/null +++ b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestore3D.hpp @@ -0,0 +1,94 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. 
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "../../../Hierarchy/Hierarchy.hpp" +#include "../../../RuntimeX/RuntimeX.h" + +#include "../DataRefactoring.h" + +#include "GridProcessingKernel3D.hpp" + +#ifndef MGARD_X_DATA_REFACTORING_COEFFICIENTS_RESTORE_3D +#define MGARD_X_DATA_REFACTORING_COEFFICIENTS_RESTORE_3D + +namespace mgard_x { + +template +void CoefficientsRestore3D(Hierarchy &hierarchy, + SubArray dinput, + SubArray &doutput, SIZE l, + int queue_idx) { + + int range_l = std::min(6, (int)std::log2(hierarchy.dofs[0][l]) - 1); + int range_lp1 = std::min(6, (int)std::log2(hierarchy.dofs[0][l + 1]) - 1); + + std::string prefix = "decomp_"; + if (sizeof(T) == sizeof(double)) + prefix += "d_"; + if (sizeof(T) == sizeof(float)) + prefix += "f_"; + for (int d = 0; d < D; d++) + prefix += std::to_string(hierarchy.shape[d]) + "_"; + + dinput.project(0, 1, 2); + doutput.project(0, 1, 2); + + SIZE f = hierarchy.dofs[0][l]; + SIZE c = hierarchy.dofs[1][l]; + SIZE r = hierarchy.dofs[2][l]; + SIZE ff = hierarchy.dofs[0][l + 1]; + SIZE cc = hierarchy.dofs[1][l + 1]; + SIZE rr = hierarchy.dofs[2][l + 1]; + + SubArray dcoarse = dinput; + dcoarse.resize({ff, cc, rr}); + SubArray dcoeff_f = dinput; + dcoeff_f.offset({ff, 0, 0}); + dcoeff_f.resize({f - ff, cc, rr}); + SubArray dcoeff_c = dinput; + dcoeff_c.offset({0, cc, 0}); + dcoeff_c.resize({ff, c - cc, rr}); + SubArray dcoeff_r = dinput; + dcoeff_r.offset({0, 0, rr}); + dcoeff_r.resize({ff, cc, r - rr}); + SubArray dcoeff_cf = dinput; + dcoeff_cf.offset({ff, cc, 0}); + dcoeff_cf.resize({f - ff, c - cc, rr}); + SubArray dcoeff_rf = dinput; + dcoeff_rf.offset({ff, 0, rr}); + dcoeff_rf.resize({f - ff, cc, r - rr}); + SubArray dcoeff_rc = dinput; + dcoeff_rc.offset({0, cc, rr}); + dcoeff_rc.resize({ff, c - cc, r - rr}); + SubArray dcoeff_rcf = dinput; + dcoeff_rcf.offset({ff, cc, rr}); + dcoeff_rcf.resize({f - ff, c - cc, r - rr}); + + GpkRev3D().Execute( + hierarchy.dofs[2][l], hierarchy.dofs[1][l], hierarchy.dofs[0][l], + hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[0][l + 1], SubArray(hierarchy.ratio_array[2][l]), + SubArray(hierarchy.ratio_array[1][l]), + SubArray(hierarchy.ratio_array[0][l]), doutput, dcoarse, dcoeff_f, + dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, dcoeff_rcf, 0, 0, 0, + hierarchy.dofs[2][l], hierarchy.dofs[1][l], hierarchy.dofs[0][l], + queue_idx); + + verify_matrix_cuda(hierarchy.dofs[2][l], hierarchy.dofs[1][l], + hierarchy.dofs[0][l], doutput.data(), doutput.getLd(0), + doutput.getLd(1), doutput.getLd(0), + prefix + "gpk_rev_3d" + "_level_" + std::to_string(l), + multidim_refactoring_store, multidim_refactoring_verify); + + if (multidim_refactoring_debug_print) { + PrintSubarray("after coeff-restore", doutput); + } +} + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestoreND.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestoreND.hpp new file mode 100644 index 0000000000..c074ed4414 --- /dev/null +++ b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestoreND.hpp @@ -0,0 +1,232 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. 
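+ *
+ * CoefficientsRestoreND undoes the N-D coefficient calculation for D > 3.
+ * Dimensions 0-2 are restored in one 3-D pass; the remaining dimensions
+ * are then handled two at a time with curr_dims = {0, d, d+1}, reusing
+ * dinput1 as scratch between passes, while unprocessed_idx advances past
+ * the dimensions already restored so the kernel knows which ones still
+ * hold reordered data. Sketch of the pairing for D = 7 (illustrative):
+ *
+ *   pass 1: dims {0, 1, 2}   unprocessed_idx = 0
+ *   pass 2: dims {0, 3, 4}   unprocessed_idx += 2
+ *   pass 3: dims {0, 5, 6}   unprocessed_idx += 2
+ *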
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "../../../Hierarchy/Hierarchy.hpp" +#include "../../../RuntimeX/RuntimeX.h" + +#include "../DataRefactoring.h" + +#include "CalcCoefficientsPointers.hpp" +#include "GridProcessingKernel.hpp" + +#ifndef MGARD_X_DATA_REFACTORING_COEFFICIENTS_RESTORE_ND +#define MGARD_X_DATA_REFACTORING_COEFFICIENTS_RESTORE_ND + +namespace mgard_x { + +template +void CoefficientsRestoreND(Hierarchy &hierarchy, + SubArray dinput1, + SubArray dinput2, + SubArray &doutput, SIZE l, + int queue_idx) { + + std::string prefix = "decomp_"; + if (sizeof(T) == sizeof(double)) + prefix += "d_"; + if (sizeof(T) == sizeof(float)) + prefix += "f_"; + for (int d = 0; d < D; d++) + prefix += std::to_string(hierarchy.shape[d]) + "_"; + + SubArray dcoarse, dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, + dcoeff_rf, dcoeff_rc, dcoeff_rcf; + + DIM curr_dims[3]; + int unprocessed_idx = 0; + + // printf("interpolate-restore 1-3D\n"); + curr_dims[0] = 0; + curr_dims[1] = 1; + curr_dims[2] = 2; + dinput1.project(curr_dims[0], curr_dims[1], curr_dims[2]); + doutput.project(curr_dims[0], curr_dims[1], curr_dims[2]); + + CalcCoefficientsPointers(hierarchy, curr_dims, l, dinput1, dcoarse, dcoeff_f, + dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf); + + GpkRev().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], + curr_dims[1], curr_dims[0], + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + SubArray(hierarchy.ratio_array[curr_dims[0]][l]), doutput, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, dcoeff_rcf, + 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], hierarchy.dofs[curr_dims[1]][l], + hierarchy.dofs[curr_dims[0]][l], queue_idx); + + for (DIM d = 3; d < D; d += 2) { + // LwpkReo().Execute(doutput, dinput1, queue_idx); + CopyND(doutput, dinput1, queue_idx); + + // printf("interpolate-restore %u-%uD\n", d+1, d+2); + curr_dims[0] = 0; + curr_dims[1] = d; + curr_dims[2] = d + 1; + dinput1.project(curr_dims[0], curr_dims[1], curr_dims[2]); + doutput.project(curr_dims[0], curr_dims[1], curr_dims[2]); + CalcCoefficientsPointers(hierarchy, curr_dims, l, dinput1, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, + dcoeff_rc, dcoeff_rcf); + + if (D - d == 1) { + unprocessed_idx += 1; + GpkRev().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], + curr_dims[1], curr_dims[0], + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + SubArray(hierarchy.ratio_array[curr_dims[0]][l]), doutput, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf, 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], + hierarchy.dofs[curr_dims[1]][l], hierarchy.dofs[curr_dims[0]][l], + queue_idx); + + } else { // D - d >= 2 + unprocessed_idx += 2; + GpkRev().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + 
SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], + curr_dims[1], curr_dims[0], + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + SubArray(hierarchy.ratio_array[curr_dims[0]][l]), doutput, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf, 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], + hierarchy.dofs[curr_dims[1]][l], hierarchy.dofs[curr_dims[0]][l], + queue_idx); + } + } + // Done interpolation-restore on doutput + + if (multidim_refactoring_debug_print) { // debug + PrintSubarray4D("After interpolation reverse-reorder", doutput); + } // debug + + unprocessed_idx = 0; + + // printf("reorder-restore 1-3D\n"); + curr_dims[0] = 0; + curr_dims[1] = 1; + curr_dims[2] = 2; + dinput2.project(curr_dims[0], curr_dims[1], curr_dims[2]); + dinput1.project(curr_dims[0], curr_dims[1], + curr_dims[2]); // reuse input1 as temp space + + CalcCoefficientsPointers(hierarchy, curr_dims, l, dinput2, dcoarse, dcoeff_f, + dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf); + + GpkRev().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], + curr_dims[1], curr_dims[0], + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput1, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, dcoeff_rcf, + 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], hierarchy.dofs[curr_dims[1]][l], + hierarchy.dofs[curr_dims[0]][l], queue_idx); + + DIM D_reduced = D % 2 == 0 ? 
D - 1 : D - 2; + for (DIM d = 3; d < D_reduced; d += 2) { + // printf("reorder-reverse\n"); + // copy back to input2 for reordering again + // LwpkReo().Execute(dinput1, dinput2, queue_idx); + CopyND(dinput1, dinput2, queue_idx); + + // printf("reorder-restore %u-%uD\n", d+1, d+2); + curr_dims[0] = 0; + curr_dims[1] = d; + curr_dims[2] = d + 1; + dinput2.project(curr_dims[0], curr_dims[1], curr_dims[2]); + dinput1.project(curr_dims[0], curr_dims[1], + curr_dims[2]); // reuse input1 as temp output + + CalcCoefficientsPointers(hierarchy, curr_dims, l, dinput2, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, + dcoeff_rc, dcoeff_rcf); + + unprocessed_idx += 2; + GpkRev().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], + curr_dims[1], curr_dims[0], + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput1, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf, 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], + hierarchy.dofs[curr_dims[1]][l], hierarchy.dofs[curr_dims[0]][l], + queue_idx); + } + + // printf("coeff-restore %u-%dD\n", D_reduced+1, D_reduced+2); + curr_dims[0] = 0; + curr_dims[1] = D_reduced; + curr_dims[2] = D_reduced + 1; + dinput1.project(curr_dims[0], curr_dims[1], curr_dims[2]); + doutput.project(curr_dims[0], curr_dims[1], curr_dims[2]); + CalcCoefficientsPointers(hierarchy, curr_dims, l, dinput1, dcoarse, dcoeff_f, + dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf); + + if (D - D_reduced == 1) { + // printf("coeff-restore %u-%dD\n", D_reduced+1, D_reduced+1); + unprocessed_idx += 1; + GpkRev().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], + curr_dims[1], curr_dims[0], + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + SubArray(hierarchy.ratio_array[curr_dims[0]][l]), doutput, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf, 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], + hierarchy.dofs[curr_dims[1]][l], hierarchy.dofs[curr_dims[0]][l], + queue_idx); + } else { // D - D_reduced >= 2 + // printf("coeff-restore %u-%dD\n", D_reduced+1, D_reduced+2); + unprocessed_idx += 2; + GpkRev().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], + curr_dims[1], curr_dims[0], + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + SubArray(hierarchy.ratio_array[curr_dims[0]][l]), doutput, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf, 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], + hierarchy.dofs[curr_dims[1]][l], hierarchy.dofs[curr_dims[0]][l], + queue_idx); + } + + if (multidim_refactoring_debug_print) { // debug + PrintSubarray4D("After coeff restore", doutput); + } // debug +} + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git 
a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel.h b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel.h deleted file mode 100644 index c60b73de2c..0000000000 --- a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#ifndef MGARD_X_GRID_PROCESSING_KERNEL -#define MGARD_X_GRID_PROCESSING_KERNEL - -#include "../../Common.h" - -namespace mgard_x { - -template -void gpk_reo(Handle &handle, SIZE *shape_h, SIZE *shape_d, - SIZE *shape_c_d, SIZE *ldvs, SIZE *ldws, DIM unprocessed_n, - DIM *unprocessed_dims, DIM curr_dim_r, DIM curr_dim_c, - DIM curr_dim_f, T *dratio_r, T *dratio_c, T *dratio_f, T *dv, - LENGTH lddv1, LENGTH lddv2, T *dw, LENGTH lddw1, LENGTH lddw2, - T *dwf, LENGTH lddwf1, LENGTH lddwf2, T *dwc, LENGTH lddwc1, - LENGTH lddwc2, T *dwr, LENGTH lddwr1, LENGTH lddwr2, T *dwcf, - LENGTH lddwcf1, LENGTH lddwcf2, T *dwrf, LENGTH lddwrf1, - LENGTH lddwrf2, T *dwrc, LENGTH lddwrc1, LENGTH lddwrc2, T *dwrcf, - LENGTH lddwrcf1, LENGTH lddwrcf2, int queue_idx, int config); - -template -void gpk_rev(Handle &handle, SIZE *shape_h, SIZE *shape_d, - SIZE *shape_c_d, SIZE *ldvs, SIZE *ldws, DIM unprocessed_n, - DIM *unprocessed_dims, DIM curr_dim_r, DIM curr_dim_c, - DIM curr_dim_f, T *dratio_r, T *dratio_c, T *dratio_f, T *dv, - LENGTH lddv1, LENGTH lddv2, T *dw, LENGTH lddw1, LENGTH lddw2, - T *dwf, LENGTH lddwf1, LENGTH lddwf2, T *dwc, LENGTH lddwc1, - LENGTH lddwc2, T *dwr, LENGTH lddwr1, LENGTH lddwr2, T *dwcf, - LENGTH lddwcf1, LENGTH lddwcf2, T *dwrf, LENGTH lddwrf1, - LENGTH lddwrf2, T *dwrc, LENGTH lddwrc1, LENGTH lddwrc2, T *dwrcf, - LENGTH lddwrcf1, LENGTH lddwrcf2, SIZE svr, SIZE svc, SIZE svf, - SIZE nvr, SIZE nvc, SIZE nvf, int queue_idx, int config); - -template -class GpkReo; - -} // namespace mgard_x - -#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel.hpp index f022bcdf77..f56c5188c2 100644 --- a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel.hpp +++ b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel.hpp @@ -2267,15 +2267,12 @@ class GpkReo : public AutoTuner { SubArray wrc, SubArray wrcf, int queue_idx) { int range_l = std::min(6, (int)std::log2(shape.dataHost()[curr_dim_f]) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_cc[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.gpk_reo_nd[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define GPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -2291,22 +2288,26 @@ class GpkReo : public AutoTuner { curr_dim_c, curr_dim_f, ratio_r, ratio_c, ratio_f, v, w, wf, wc, wr, \ wcf, wrf, wrc, wrcf, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; 
\ } \ } \ } - GPK(0) - GPK(1) - GPK(2) - GPK(3) - GPK(4) - GPK(5) - GPK(6) + GPK(6) if (!ret.success) config--; + GPK(5) if (!ret.success) config--; + GPK(4) if (!ret.success) config--; + GPK(3) if (!ret.success) config--; + GPK(2) if (!ret.success) config--; + GPK(1) if (!ret.success) config--; + GPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for GpkReo.\n"; + exit(-1); + } #undef GPK if (AutoTuner::ProfileKernels) { @@ -4686,15 +4687,12 @@ class GpkRev : public AutoTuner { SubArray wrcf, SIZE svr, SIZE svc, SIZE svf, SIZE nvr, SIZE nvc, SIZE nvf, int queue_idx) { int range_l = std::min(6, (int)std::log2(shape.dataHost()[curr_dim_f]) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_cc[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.gpk_rev_nd[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define GPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -4710,22 +4708,26 @@ class GpkRev : public AutoTuner { curr_dim_c, curr_dim_f, ratio_r, ratio_c, ratio_f, v, w, wf, wc, wr, \ wcf, wrf, wrc, wrcf, svr, svc, svf, nvr, nvc, nvf, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - GPK(0) - GPK(1) - GPK(2) - GPK(3) - GPK(4) - GPK(5) - GPK(6) + GPK(6) if (!ret.success) config--; + GPK(5) if (!ret.success) config--; + GPK(4) if (!ret.success) config--; + GPK(3) if (!ret.success) config--; + GPK(2) if (!ret.success) config--; + GPK(1) if (!ret.success) config--; + GPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for GpkRev.\n"; + exit(-1); + } #undef GPK if (AutoTuner::ProfileKernels) { @@ -4734,4512 +4736,6 @@ class GpkRev : public AutoTuner { } }; -// template -// __global__ void -// _gpk_reo(SIZE *shape, SIZE *shape_c, SIZE *ldvs, SIZE *ldws, DIM -// unprocessed_n, -// DIM *unprocessed_dims, DIM curr_dim_r, DIM curr_dim_c, DIM -// curr_dim_f, T *dratio_r, T *dratio_c, T *dratio_f, T *dv, LENGTH -// lddv1, LENGTH lddv2, T *dw, LENGTH lddw1, LENGTH lddw2, T *dwf, -// LENGTH lddwf1, LENGTH lddwf2, T *dwc, LENGTH lddwc1, LENGTH lddwc2, -// T *dwr, LENGTH lddwr1, LENGTH lddwr2, T *dwcf, LENGTH lddwcf1, -// LENGTH lddwcf2, T *dwrf, LENGTH lddwrf1, LENGTH lddwrf2, T *dwrc, -// LENGTH lddwrc1, LENGTH lddwrc2, T *dwrcf, LENGTH lddwrcf1, LENGTH -// lddwrcf2) { - -// // bool debug = false; -// // if (FunctorBase::GetBlockIdX() == 0 && -// FunctorBase::GetBlockIdY() ==0 && -// FunctorBase::GetBlockIdZ() == 0 && -// // threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) debug = -// // false; - -// // volatile clock_t start = 0; -// // volatile clock_t end = 0; -// // volatile unsigned long long sum_time = 0; - -// LENGTH threadId = (threadIdx.z * (FunctorBase::GetBlockDimX() * -// FunctorBase::GetBlockDimY())) + -// (threadIdx.y * FunctorBase::GetBlockDimX()) + -// threadIdx.x; - -// SIZE nr, nc, nf; -// SIZE nr_c, nc_c, nf_c; -// SIZE r, c, f; -// SIZE rest_r, rest_c, rest_f; -// SIZE nr_p, nc_p, nf_p; -// SIZE rest_r_p, rest_c_p, rest_f_p; -// SIZE r_sm, c_sm, f_sm; -// SIZE r_sm_ex, c_sm_ex, f_sm_ex; 
-// SIZE r_gl, c_gl, f_gl; -// SIZE r_gl_ex, c_gl_ex, f_gl_ex; -// T res; -// bool in_next = true; - -// T *sm = SharedMemory(); -// SIZE ldsm1 = (F/2) * 2 + 1; -// SIZE ldsm2 = (C/2) * 2 + 1; - -// T *v_sm = sm; sm += ((F/2) * 2 + 1) * ((C/2) * 2 + 1) * ((R/2) * 2 + 1); -// T *ratio_f_sm = sm; sm += (F/2) * 2; -// T *ratio_c_sm = sm; sm += (C/2) * 2; -// T *ratio_r_sm = sm; sm += (R/2) * 2; - -// SIZE * sm_size = (SIZE*)sm; -// SIZE *shape_sm = sm_size; sm_size += D_GLOBAL; -// SIZE *shape_c_sm = sm_size; sm_size += D_GLOBAL; -// SIZE *ldvs_sm = sm_size; sm_size += D_GLOBAL; -// SIZE *ldws_sm = sm_size; sm_size += D_GLOBAL; -// sm = (T*)sm_size; - -// DIM * sm_dim = (DIM*)sm; -// DIM *unprocessed_dims_sm = sm_dim; sm_dim += D_GLOBAL; -// sm = (T*)sm_dim; - -// SIZE idx[D_GLOBAL]; -// if (threadId < D_GLOBAL) { -// shape_sm[threadId] = shape[threadId]; -// shape_c_sm[threadId] = shape_c[threadId]; -// ldvs_sm[threadId] = ldvs[threadId]; -// ldws_sm[threadId] = ldws[threadId]; -// } - -// if (threadId < unprocessed_n) { -// unprocessed_dims_sm[threadId] = unprocessed_dims[threadId]; -// } -// __syncthreads(); - -// for (DIM d = 0; d < D_GLOBAL; d++) -// idx[d] = 0; - -// nr = shape_sm[curr_dim_r]; -// nc = shape_sm[curr_dim_c]; -// nf = shape_sm[curr_dim_f]; - -// nr_c = shape_c_sm[curr_dim_r]; -// nc_c = shape_c_sm[curr_dim_c]; -// nf_c = shape_c_sm[curr_dim_f]; - -// if (D_LOCAL < 3) { -// nr = 1; -// nr_c = 1; -// } -// if (D_LOCAL < 2) { -// nc = 1; -// nc_c = 1; -// } - -// r = FunctorBase::GetBlockIdZ() * -// FunctorBase::GetBlockDimZ(); c = -// FunctorBase::GetBlockIdY() * -// FunctorBase::GetBlockDimY(); SIZE bidx = -// FunctorBase::GetBlockIdX(); SIZE firstD = -// div_roundup(shape_sm[0] - 1, FunctorBase::GetBlockDimX()); f = -// (bidx % firstD) * FunctorBase::GetBlockDimX(); - -// bidx /= firstD; - -// // if (debug) printf("n: %d %d %d rcf: %d %d %d\n", nr, nc, nf, r, c, f); -// rest_r = nr - r; -// rest_c = nc - c; -// rest_f = nf - f; - -// nr_p = nr; -// nc_p = nc; -// nf_p = nf; - -// rest_r_p = rest_r; -// rest_c_p = rest_c; -// rest_f_p = rest_f; - -// if (nr % 2 == 0) { -// nr_p = nr + 1; -// rest_r_p = nr_p - r; -// } -// if (nc % 2 == 0) { -// nc_p = nc + 1; -// rest_c_p = nc_p - c; -// } -// if (nf % 2 == 0) { -// nf_p = nf + 1; -// rest_f_p = nf_p - f; -// } - -// for (DIM d = 0; d < D_GLOBAL; d++) { -// if (D_LOCAL == 3 && d != curr_dim_r && d != curr_dim_c && d != -// curr_dim_f) { -// idx[d] = bidx % shape_sm[d]; -// bidx /= shape_sm[d]; -// if (idx[d] >= shape_c_sm[d]) -// in_next = false; -// } -// if (D_LOCAL == 2 && d != curr_dim_c && d != curr_dim_f) { -// idx[d] = bidx % shape_sm[d]; -// bidx /= shape_sm[d]; -// if (idx[d] >= shape_c_sm[d]) -// in_next = false; -// } -// } - -// int skip = 0; -// #pragma unroll 1 -// for (DIM t = 0; t < D_GLOBAL; t++) { -// for (DIM k = 0; k < unprocessed_n; k++) { -// if (t == unprocessed_dims_sm[k] && -// (shape_sm[t] % 2 == 1 && idx[t] % 2 == 1 || -// shape_sm[t] % 2 == 0 && idx[t] % 2 == 1 && -// idx[t] != shape_sm[t] - 1)) { -// skip = 1; -// } -// } -// } - -// // if (FunctorBase::GetBlockIdX() == 0 && -// FunctorBase::GetBlockIdY() == 0 && -// FunctorBase::GetBlockIdZ() == 0) { -// // if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) { -// // printf("total_idx_sm: %d %d %d %d (skip: %d)\n", idx[3], idx[2], -// idx[1], -// // idx[0], skip); -// // } -// // } - -// LENGTH other_offset_v = get_idx(ldvs_sm, idx); -// LENGTH other_offset_w = get_idx(ldws_sm, idx); - -// dv = dv + other_offset_v; -// dw = dw + 
other_offset_w; -// dwr = dwr + other_offset_w; -// dwc = dwc + other_offset_w; -// dwf = dwf + other_offset_w; -// dwrf = dwrf + other_offset_w; -// dwrc = dwrc + other_offset_w; -// dwcf = dwcf + other_offset_w; -// dwrcf = dwrcf + other_offset_w; - -// if (TYPE == 2) { -// dwf = dw; -// dwcf = dwc; -// dwrf = dwr; -// dwrcf = dwrc; -// } -// __syncthreads(); -// // if (!skip) -// { -// r_sm = threadIdx.z; -// c_sm = threadIdx.y; -// f_sm = threadIdx.x; - -// r_sm_ex = (R/2) * 2; -// c_sm_ex = (C/2) * 2; -// f_sm_ex = (F/2) * 2; - -// r_gl = r + r_sm; -// r_gl_ex = r + (R/2) * 2; -// c_gl = c + c_sm; -// c_gl_ex = c + (C/2) * 2; -// f_gl = f + f_sm; -// f_gl_ex = f + (F/2) * 2; - -// // __syncthreads(); -// // if (r_sm == 0 && c_sm == 0 && f_sm == 0) { -// // //printf("setting zeros\n"); -// // for (int i = 0; i < (R/2) * 2 + 1; i++) { -// // for (int j = 0; j < (C/2) * 2 + 1; j++) { -// // for (int k = 0; k < (F/2) * 2 + 1; k++) { -// // v_sm[get_idx(ldsm1, ldsm2, i, j, k)] = 0.0; -// // } -// // } -// // } -// // //printf("done zeros\n"); -// // } -// // __syncthreads(); -// /* Load v */ -// // loading extra rules -// // case 1: input = odd (non-padding required) -// // case 1.a: block size < rest (need to load extra); -// // case 1.b: block size > rest (NO need to load extra); -// // case 2: input = even (padding requried) -// // case 2.a: block size < rest (need to load extra); -// // case 2.b: block size >= rest (NO need to load extra, but need -// // padding); - -// // Load from dv -// if (r_sm < rest_r && c_sm < rest_c && f_sm < rest_f) { - -// // load cubic -// // asm volatile("membar.cta;"); -// // start = clock64(); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = -// dv[get_idx(lddv1, lddv2, r_gl, c_gl, f_gl)]; -// // if (FunctorBase::GetBlockIdX()==0 && -// FunctorBase::GetBlockIdY()==0&&FunctorBase::GetBlockIdZ()==0) -// { -// // printf("load (%d %d %d) %f <- %d+(%d %d %d) (ld: %d %d)\n", -// // r_sm, c_sm, f_sm, -// // dv[get_idx(lddv1, lddv2, r_gl, c_gl, f_gl)], -// // other_offset_v+r_gl, c_gl, f_gl, lddv1, lddv2); -// // } -// if (r_sm == 0) { -// if (rest_r > (R/2) * 2) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, f_sm)] = -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl)]; -// } -// } -// if (c_sm == 0) { -// if (rest_c > (C/2) * 2) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, f_sm)] = -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl)]; -// } -// } -// if (f_sm == 0) { -// if (rest_f > (F/2) * 2) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm_ex)] = -// dv[get_idx(lddv1, lddv2, r_gl, c_gl, f_gl_ex)]; -// } -// } -// if (c_sm == 0 && f_sm == 0) { -// if (rest_c > (C/2) * 2 && rest_f > (F/2) * 2) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, f_sm_ex)] = -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)]; -// } -// } -// if (r_sm == 0 && f_sm == 0) { -// if (rest_r > (R/2) * 2 && rest_f > (F/2) * 2) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, f_sm_ex)] = -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl_ex)]; -// } -// } -// if (r_sm == 0 && c_sm == 0) { -// if (rest_r > (R/2) * 2 && rest_c > (C/2) * 2) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, f_sm)] = -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl)]; -// } -// } -// if (r_sm == 0 && c_sm == 0 && f_sm == 0) { -// if (rest_r > (R/2) * 2 && rest_c > (C/2) * 2 && rest_f > (F/2) * 2) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, f_sm_ex)] = -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)]; -// } -// } -// } - -// __syncthreads(); - -// // apply padding is necessary -// if (r_sm 
< rest_r && c_sm < rest_c && f_sm < rest_f) { - -// // printf("load main[%d %d %d]:%f --> [%d %d %d] (%d %d %d)\n", r_gl, -// // c_gl, f_gl, -// // dv[get_idx(lddv1, lddv2, r_gl, c_gl, f_gl)], r_sm, c_sm, f_sm, -// nr, -// // nc, nf); - -// // asm volatile("membar.cta;"); -// // start = clock64() - start; -// // printf("[load main] block id %d,%d,%d elapsed %lu\n", -// FunctorBase::GetBlockIdZ(), -// // FunctorBase::GetBlockIdY(), -// FunctorBase::GetBlockIdX(), start); start = clock64(); - -// // load extra surface - -// if (r_sm == 0) { -// if (rest_r > (R/2) * 2) { -// // v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, f_sm)] = -// // dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl)]; -// // printf("load-r[%d %d %d]:%f --> [%d %d %d]\n", r_gl_ex, c_gl, -// f_gl, -// // dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl)], r_sm_ex, c_sm, -// // f_sm); -// } else if (nr % 2 == 0) { -// // if (r == 16 && c == 0 && f == 0) { -// // printf("padding (%d %d %d) %f <- (%f %f %f)\n", rest_r_p - 1, -// // c_sm, f_sm, -// // v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, c_sm, f_sm)], -// rest_r -// // - 1, c_sm, f_sm); -// // padded = true; -// // aa = v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, c_sm, f_sm)]; -// // bb = v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm, f_sm)]; -// // } -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm, f_sm)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, c_sm, f_sm)]; -// } -// } - -// if (c_sm == 0) { -// if (rest_c > (C/2) * 2) { -// // v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, f_sm)] = -// // dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl)]; -// // printf("load-c[%d %d %d]:%f --> [%d %d %d]\n", r_gl, c_gl_ex, -// f_gl, -// // dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl)], r_sm, c_sm_ex, -// // f_sm); -// } else if (nc % 2 == 0) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c_p - 1, f_sm)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c - 1, f_sm)]; -// } -// } - -// if (f_sm == 0) { -// if (rest_f > (F/2) * 2) { -// // v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm_ex)] = -// // dv[get_idx(lddv1, lddv2, r_gl, c_gl, f_gl_ex)]; -// // printf("load-f[%d %d %d]:%f --> [%d %d %d]\n", r_gl, c_gl, -// f_gl_ex, -// // dv[get_idx(lddv1, lddv2, r_gl, c_gl, f_gl_ex)], r_sm, c_sm, -// // f_sm_ex); -// } else if (nf % 2 == 0) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, rest_f_p - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, rest_f - 1)]; -// } -// } - -// // load extra edges -// if (c_sm == 0 && f_sm == 0) { -// if (rest_c > (C/2) * 2 && rest_f > (F/2) * 2) { -// // v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, f_sm_ex)] = -// // dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)]; -// // printf("load-cf[%d %d %d]:%f --> [%d %d %d]\n", r_gl, c_gl_ex, -// // f_gl_ex, dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)], -// r_sm, -// // c_sm_ex, f_sm_ex); -// } else if (rest_c <= (C/2) * 2 && rest_f <= (F/2) * 2 && nc % 2 == 0 -// && -// nf % 2 == 0) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c_p - 1, rest_f_p - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c - 1, rest_f - 1)]; -// } else if (rest_c > (C/2) * 2 && rest_f <= (F/2) * 2 && nf % 2 == 0) -// { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, rest_f_p - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, rest_f - 1)]; -// } else if (rest_c <= (C/2) * 2 && rest_f > (F/2) * 2 && nc % 2 == 0) -// { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c_p - 1, f_sm_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c - 1, f_sm_ex)]; -// } -// } - -// if (r_sm == 0 && f_sm == 0) { -// if (rest_r > (R/2) * 2 && rest_f > (F/2) * 2) { -// // v_sm[get_idx(ldsm1, ldsm2, 
r_sm_ex, c_sm, f_sm_ex)] = -// // dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl_ex)]; -// // printf("load-rf[%d %d %d]:%f --> [%d %d %d]\n", r_gl_ex, c_gl, -// // f_gl_ex, dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl_ex)], -// // r_sm_ex, c_sm, f_sm_ex); -// } else if (rest_r <= (R/2) * 2 && rest_f <= (F/2) * 2 && nr % 2 == 0 -// && -// nf % 2 == 0) { -// // printf("padding (%d %d %d) <- (%d %d %d)\n", rest_r_p - 1, c_sm, -// // rest_f_p - 1, rest_r - 1, c_sm, rest_f - 1); -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm, rest_f_p - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, c_sm, rest_f - 1)]; -// } else if (rest_r > (R/2) * 2 && rest_f <= (F/2) * 2 && nf % 2 == 0) -// { -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, rest_f_p - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, rest_f - 1)]; -// } else if (rest_r <= (R/2) * 2 && rest_f > (F/2) * 2 && nr % 2 == 0) -// { -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm, f_sm_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, c_sm, f_sm_ex)]; -// } -// } - -// if (r_sm == 0 && c_sm == 0) { -// if (rest_r > (R/2) * 2 && rest_c > (C/2) * 2) { -// // v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, f_sm)] = -// // dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl)]; -// // printf("load-rc[%d %d %d]:%f --> [%d %d %d]\n", r_gl_ex, -// c_gl_ex, -// // f_gl, dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl)], -// r_sm_ex, -// // c_sm_ex, f_sm); -// } else if (rest_r <= (R/2) * 2 && rest_c <= (C/2) * 2 && nr % 2 == 0 -// && -// nc % 2 == 0) { -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, rest_c_p - 1, f_sm)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, rest_c - 1, f_sm)]; -// // printf("padding (%d %d %d) <- (%d %d %d): %f\n", rest_r_p - 1, -// // rest_c_p - 1, f_sm, rest_r - 1, rest_c - 1, f_sm, -// // v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, rest_c - 1, f_sm)]); -// } else if (rest_r > (R/2) * 2 && rest_c <= (C/2) * 2 && nc % 2 == 0) -// { -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c_p - 1, f_sm)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c - 1, f_sm)]; -// } else if (rest_r <= (R/2) * 2 && rest_c > (C/2) * 2 && nr % 2 == 0) -// { -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm_ex, f_sm)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, c_sm_ex, f_sm)]; -// } -// } -// // load extra vertex - -// if (r_sm == 0 && c_sm == 0 && f_sm == 0) { -// if (rest_r > (R/2) * 2 && rest_c > (C/2) * 2 && rest_f > (F/2) * 2) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, f_sm_ex)] = -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)]; -// // printf("load-rcf[%d %d %d]:%f --> [%d %d %d]\n", r_gl_ex, -// c_gl_ex, -// // f_gl_ex, dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)], -// // r_sm_ex, c_sm_ex, f_sm_ex); -// } else if (rest_r <= (R/2) * 2 && rest_c <= (C/2) * 2 && rest_f <= -// (F/2) * 2 && -// nr % 2 == 0 && nc % 2 == 0 && nf % 2 == 0) { -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, rest_c_p - 1, -// rest_f_p - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, rest_c - 1, rest_f - -// 1)]; -// } else if (rest_r > (R/2) * 2 && rest_c > (C/2) * 2 && rest_f <= -// (F/2) * 2 && -// nf % 2 == 0) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, rest_f_p - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, rest_f - 1)]; -// } else if (rest_r > (R/2) * 2 && rest_c <= (C/2) * 2 && rest_f > -// (F/2) * 2 && -// nc % 2 == 0) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c_p - 1, f_sm_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c - 1, f_sm_ex)]; -// } else if (rest_r > (R/2) * 2 && rest_c <= (C/2) * 2 && rest_f <= -// (F/2) * 2 && -// nc % 2 
== 0 && nf % 2 == 0) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c_p - 1, rest_f_p - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c - 1, rest_f - 1)]; -// } else if (rest_r <= (R/2) * 2 && rest_c > (C/2) * 2 && rest_f > -// (F/2) * 2 && -// nr % 2 == 0) { -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm_ex, f_sm_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, c_sm_ex, f_sm_ex)]; -// } else if (rest_r <= (R/2) * 2 && rest_c > (C/2) * 2 && rest_f <= -// (F/2) * 2 && -// nr % 2 == 0 && nf % 2 == 0) { -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm_ex, rest_f_p - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, c_sm_ex, rest_f - 1)]; -// } else if (rest_r <= (R/2) * 2 && rest_c <= (C/2) * 2 && rest_f > -// (F/2) * 2 && -// nr % 2 == 0 && nc % 2 == 0) { -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, rest_c_p - 1, f_sm_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, rest_c - 1, f_sm_ex)]; -// } -// } - -// // asm volatile("membar.cta;"); -// // start = clock64() - start; -// // printf("[load extra] block id %d,%d,%d elapsed %lu\n", -// FunctorBase::GetBlockIdZ(), -// // FunctorBase::GetBlockIdY(), -// FunctorBase::GetBlockIdX(), start); start = clock64(); - -// // load dist -// if (c_sm == 0 && f_sm == 0 && r_sm < rest_r_p - 2) { -// // printf("%d/%d load %f\n", r_sm, rest_r - 2, dratio_r[r + r_sm]); -// ratio_r_sm[r_sm] = dratio_r[r + r_sm]; -// // if (nr % 2 == 0 && (R/2) * 2 + 1 >= rest_r_p && r_sm == 0) { -// // ratio_r_sm[rest_r_p - 3] = 0.5; -// // } -// } -// if (r_sm == 0 && f_sm == 0 && c_sm < rest_c_p - 2) { -// ratio_c_sm[c_sm] = dratio_c[c + c_sm]; -// // if (nc % 2 == 0 && (C/2) * 2 + 1 >= rest_c_p && c_sm == 0) { -// // ratio_c_sm[rest_c_p - 3] = 0.5; -// // } -// } -// if (c_sm == 0 && r_sm == 0 && f_sm < rest_f_p - 2) { -// ratio_f_sm[f_sm] = dratio_f[f + f_sm]; -// // if (nf % 2 == 0 && (F/2) * 2 + 1 >= rest_f_p && f_sm == 0) { -// // ratio_f_sm[rest_f_p - 3] = 0.5; -// // } -// } - -// // if (r == 0 && c == 0 && f == 0 && r_sm == 0 && c_sm == 0 && f_sm == -// 0) -// // { -// // printf("ratio:"); -// // for (int i = 0; i < (R/2) * 2 + 1; i++) { -// // printf("%2.2f ", ratio_r_sm[i]); -// // } -// // printf("\n"); -// // } - -// } // restrict boundary - -// // asm volatile("membar.cta;"); -// // start = clock64() - start; -// // printf("[load ratio] block id %d,%d,%d elapsed %lu\n", -// FunctorBase::GetBlockIdZ(), -// // FunctorBase::GetBlockIdY(), -// FunctorBase::GetBlockIdX(), start); start = clock64(); - -// // __syncthreads(); -// // // debug print -// // if (debug) { -// // printf("in config: %d %d %d (%d %d %d)\n", (R/2), (C/2), (F/2), -// r,c,f); -// // printf("rest_p: %d %d %d\n", rest_r_p, rest_c_p, rest_f_p); -// // bool print = false; -// // for (int i = 0; i < (R/2) * 2 + 1; i++) { -// // for (int j = 0; j < (C/2) * 2 + 1; j++) { -// // for (int k = 0; k < (F/2) * 2 + 1; k++) { -// // // if (abs(v_sm[get_idx(ldsm1, ldsm2, i, j, k)]) > 10000) { -// // // print = true; -// // // printf("(block %d %d %d) %2.2f \n", r,c,f, -// // v_sm[get_idx(ldsm1, ldsm2, i, j, k)]); -// // // printf("%2.2f ", v_sm[get_idx(ldsm1, ldsm2, i, j, k)]); -// // // } -// // printf("%2.2f ", v_sm[get_idx(ldsm1, ldsm2, i, j, k)]); -// // } -// // printf("\n"); -// // } -// // printf("\n"); -// // } -// // } -// __syncthreads(); - -// if (dw && threadId < (R/2) * (C/2) * (F/2)) { -// r_sm = (threadId / ((C/2) * (F/2))) * 2; -// c_sm = ((threadId % ((C/2) * (F/2))) / (F/2)) * 2; -// f_sm = ((threadId % ((C/2) * (F/2))) % (F/2)) * 2; -// r_gl = r / 2 + threadId / ((C/2) * (F/2)); -// 
c_gl = c / 2 + threadId % ((C/2) * (F/2)) / (F/2); -// f_gl = f / 2 + threadId % ((C/2) * (F/2)) % (F/2); -// res = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)]; -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf_c) { -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf) { -// if (!skip) { -// if (INTERPOLATION) { -// ; -// } -// if (CALC_COEFF) { -// if (in_next && f_gl < nf_c) { -// ; -// } else { -// res -= dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)]; -// } -// } -// } -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } -// } - -// // asm volatile("membar.cta;"); -// // start = clock64() - start; -// // printf("[store coarse] block id %d,%d,%d elapsed %lu\n", -// FunctorBase::GetBlockIdZ(), -// // FunctorBase::GetBlockIdY(), -// FunctorBase::GetBlockIdX(), start); start = clock64(); int -// base = 0; -// // printf("TYPE =%d \n", TYPE); -// // printf("%d == %d && %llu >= %d && %llu < %d\n", r + (R/2) * 2, nr_p - -// 1, -// // threadId, base, threadId, base + (C/2) * (F/2)); - -// if (dw && r + (R/2) * 2 == nr_p - 1 && threadId >= base && -// threadId < base + (C/2) * (F/2)) { -// r_sm = (R/2) * 2; -// c_sm = ((threadId - base) / (F/2)) * 2; -// f_sm = ((threadId - base) % (F/2)) * 2; -// r_gl = r / 2 + (R/2); -// c_gl = c / 2 + (threadId - base) / (F/2); -// f_gl = f / 2 + (threadId - base) % (F/2); -// res = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)]; -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf_c) { -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf) { -// if (!skip) { -// if (INTERPOLATION) { -// ; -// } -// if (CALC_COEFF) { -// if (in_next && f_gl < nf_c) { -// ; -// } else { -// res -= dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)]; -// } -// } -// } -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } -// } - -// base += (C/2) * (F/2); // ROUND_UP_WARP((C/2) * (F/2)) * WARP_SIZE; -// if (dw && c + (C/2) * 2 == nc_p - 1 && threadId >= base && -// threadId < base + (R/2) * (F/2)) { -// r_sm = ((threadId - base) / (F/2)) * 2; -// c_sm = (C/2) * 2; -// f_sm = ((threadId - base) % (F/2)) * 2; -// r_gl = r / 2 + (threadId - base) / (F/2); -// c_gl = c / 2 + (C/2); -// f_gl = f / 2 + (threadId - base) % (F/2); -// res = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)]; -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < 
nf_c) { -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf) { -// if (!skip) { -// if (INTERPOLATION) { -// ; -// } -// if (CALC_COEFF) { -// if (in_next && f_gl < nf_c) { -// ; -// } else { -// res -= dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)]; -// } -// } -// } -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } -// // printf("(%d %d %d) (%d %d %d) %f\n", -// // r_sm, c_sm, f_sm, r_gl, c_gl, f_gl, dwork[get_idx(lddv1, -// lddv2, -// // r_gl, c_gl, f_gl)]); -// } - -// base += (R/2) * (F/2); // ROUND_UP_WARP((R/2) * (F/2)) * WARP_SIZE; -// // printf("%d %d\n", base, threadId); -// if (dw && f + (F/2) * 2 == nf_p - 1 && threadId >= base && -// threadId < base + (R/2) * (C/2)) { -// r_sm = ((threadId - base) / (C/2)) * 2; -// c_sm = ((threadId - base) % (C/2)) * 2; -// f_sm = (F/2) * 2; -// r_gl = r / 2 + (threadId - base) / (C/2); -// c_gl = c / 2 + (threadId - base) % (C/2); -// f_gl = f / 2 + (F/2); -// res = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)]; -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf_c) { -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf) { -// if (!skip) { -// if (INTERPOLATION) { -// ; -// } -// if (CALC_COEFF) { -// if (in_next && f_gl < nf_c) { -// ; -// } else { -// res -= dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)]; -// } -// } -// } -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } -// } - -// base += (R/2) * (C/2); // ROUND_UP_WARP((R/2) * (C/2)) * WARP_SIZE; -// // load extra edges -// if (dw && c + (C/2) * 2 == nc_p - 1 && f + (F/2) * 2 == nf_p - 1 && -// threadId >= base && threadId < base + (R/2)) { -// r_sm = (threadId - base) * 2; -// c_sm = (C/2) * 2; -// f_sm = (F/2) * 2; -// r_gl = r / 2 + threadId - base; -// c_gl = c / 2 + (C/2); -// f_gl = f / 2 + (F/2); -// res = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)]; -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf_c) { -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf) { -// if (!skip) { -// if (INTERPOLATION) { -// ; -// } -// if (CALC_COEFF) { -// if (in_next && f_gl < nf_c) 
{ -// ; -// } else { -// res -= dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)]; -// } -// } -// } -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } -// } - -// base += (R/2); // ROUND_UP_WARP((R/2)) * WARP_SIZE; -// // if (TYPE == 2) printf("%d %d, %d, %llu, %d\n",dw == NULL, f + (F/2) * -// 2, nf_p -// // - 1, threadId, (C/2)); -// if (dw && r + (R/2) * 2 == nr_p - 1 && f + (F/2) * 2 == nf_p - 1 && -// threadId >= base && threadId < base + (C/2)) { -// r_sm = (R/2) * 2; -// c_sm = (threadId - base) * 2; -// f_sm = (F/2) * 2; -// r_gl = r / 2 + (R/2); -// c_gl = c / 2 + threadId - base; -// f_gl = f / 2 + (F/2); -// res = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)]; -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf_c) { -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf) { -// if (!skip) { -// if (INTERPOLATION) { -// ; -// } -// if (CALC_COEFF) { -// if (in_next && f_gl < nf_c) { -// ; -// } else { -// res -= dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)]; -// } -// } -// } -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } -// // printf("store[%d %d %d]: %f\n", r_sm, c_sm, f_sm, -// v_sm[get_idx(ldsm1, -// // ldsm2, r_sm, c_sm, f_sm)]); -// } - -// base += (C/2); // ROUND_UP_WARP((C/2)) * WARP_SIZE; -// if (dw && r + (R/2) * 2 == nr_p - 1 && c + (C/2) * 2 == nc_p - 1 && -// threadId >= base && threadId < base + (F/2)) { -// r_sm = (R/2) * 2; -// c_sm = (C/2) * 2; -// f_sm = (threadId - base) * 2; -// r_gl = r / 2 + (R/2); -// c_gl = c / 2 + (C/2); -// f_gl = f / 2 + threadId - base; -// res = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)]; -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf_c) { -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf) { -// if (!skip) { -// if (INTERPOLATION) { -// ; -// } -// if (CALC_COEFF) { -// if (in_next && f_gl < nf_c) { -// ; -// } else { -// res -= dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)]; -// } -// } -// } -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } -// } -// base += (F/2); // ROUND_UP_WARP((F/2)) * WARP_SIZE; -// // // load extra vertex -// if (dw && r + (R/2) * 2 == nr_p - 1 && c + (C/2) * 2 == nc_p - 1 && -// f + (F/2) * 2 == nf_p - 1 && threadId >= base && 
threadId < base + 1) -// { -// r_sm = (R/2) * 2; -// c_sm = (C/2) * 2; -// f_sm = (F/2) * 2; -// r_gl = r / 2 + (R/2); -// c_gl = c / 2 + (C/2); -// f_gl = f / 2 + (F/2); -// res = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)]; -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf_c) { -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf) { -// if (!skip) { -// if (INTERPOLATION) { -// ; -// } -// if (CALC_COEFF) { -// if (in_next && f_gl < nf_c) { -// ; -// } else { -// res -= dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)]; -// } -// } -// } -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } -// } - -// // asm volatile("membar.cta;"); -// // start = clock64() - start; -// // printf("[store extra] block id %d,%d,%d elapsed %lu\n", -// FunctorBase::GetBlockIdZ(), -// // FunctorBase::GetBlockIdY(), -// FunctorBase::GetBlockIdX(), start); start = clock64(); - -// // start = clock64(); - -// if (dwf && threadId >= (R/2) * (C/2) * (F/2) && threadId < (R/2) * (C/2) -// * (F/2) * 2) { -// r_sm = ((threadId - (R/2) * (C/2) * (F/2)) / ((C/2) * (F/2))) * 2; -// c_sm = (((threadId - (R/2) * (C/2) * (F/2)) % ((C/2) * (F/2))) / (F/2)) -// * 2; f_sm = (((threadId - (R/2) * (C/2) * (F/2)) % ((C/2) * (F/2))) % -// (F/2)) * 2 + 1; r_gl = r / 2 + (threadId - (R/2) * (C/2) * (F/2)) / -// ((C/2) * (F/2)); c_gl = c / 2 + ((threadId - (R/2) * (C/2) * (F/2)) % -// ((C/2) * (F/2))) / (F/2); f_gl = f / 2 + ((threadId - (R/2) * (C/2) * -// (F/2)) % ((C/2) * (F/2))) % (F/2); res = v_sm[get_idx(ldsm1, ldsm2, -// r_sm, c_sm, f_sm)]; if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf - nf_c) { -// if (!skip) { -// if (INTERPOLATION) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm - 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm + 1)], -// ratio_f_sm[f_sm - 1]); -// } -// if (INTERPOLATION && CALC_COEFF) { // fused -// res = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] - res; -// } -// if (!INTERPOLATION && CALC_COEFF) { // calc_coeff only -// res -= dwf[get_idx(lddwf1, lddwf2, r_gl, c_gl, f_gl)]; -// } -// } -// dwf[get_idx(lddwf1, lddwf2, r_gl, c_gl, f_gl)] = res; -// } -// } else if (TYPE == 2) { -// f_gl = 2 * f_gl + 1; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf) { -// if (!skip) { -// if (INTERPOLATION) { -// ; -// } -// if (CALC_COEFF) { -// if (in_next && f_gl < nf_c) { -// ; -// } else { -// res -= dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)]; -// } -// } -// } -// dwf[get_idx(lddwf1, lddwf2, r_gl, c_gl, f_gl)] = res; -// } -// } - -// // if (nr == 70) printf("f-store: (%d %d %d) <- %f (%d %d %d)\n", r_gl, -// // c_gl, f_gl, v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)], r_sm, -// c_sm, -// // f_sm); -// // asm volatile("membar.cta;"); -// // start = clock64() - start; -// // printf("[(F/2)-store] block id %d,%d,%d elapsed %lu\n", -// FunctorBase::GetBlockIdZ(), -// 
[Deleted by this hunk: the remainder of the commented-out `_gpk_reo` kernel body. One thread partition of size (R/2) * (C/2) * (F/2) per coefficient plane (dwf, dwc, dwr, dwcf, dwrf, dwrc, dwrcf) decomposed its flat threadId into shared-memory indices (r_sm, c_sm, f_sm) and global indices (r_gl, c_gl, f_gl), computed `res` from nested lerp() interpolations under the INTERPOLATION / CALC_COEFF template flags (the TYPE == 2 branches remap f_gl and mostly skip the in_next test), and stored `res` back through get_idx(). The same stores were then repeated for the three boundary planes (r + (R/2)*2 == nr_p - 1, c + (C/2)*2 == nc_p - 1, f + (F/2)*2 == nf_p - 1) and the three boundary edges, followed by a commented-out dump of the shared-memory tile that closed the kernel; interleaved clock64()/membar.cta timing printfs were deleted along with it. The commented-out `gpk_reo_adaptive_launcher<D_GLOBAL, D_LOCAL, T, R, C, F>` signature begins at the end of this span.]
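For readers skimming this hunk: every deleted store built its coefficient value by composing 1-D lerp() calls per axis. Below is a minimal standalone sketch of that composition, assuming the conventional two-point form of lerp (the deleted code only shows lerp's call sites, and the values here are illustrative, not MGARD's):

#include <cstdio>

// Assumed two-point form; only lerp's call sites appear in the deleted code.
template <typename T> T lerp(T v0, T v1, T ratio) {
  return v0 + ratio * (v1 - v0);
}

int main() {
  // Corner values of one 2-D cell and the per-axis ratios, standing in for
  // the kernel's v_sm reads and ratio_c_sm / ratio_r_sm entries.
  double v00 = 1.0, v01 = 3.0, v10 = 2.0, v11 = 6.0;
  double ratio_c = 0.5, ratio_r = 0.25;
  // Two-stage composition, as in the deleted dwrc section: interpolate
  // along c on each r-row, then along r between the two results.
  double c1 = lerp(v00, v01, ratio_c);  // = 2.0
  double c2 = lerp(v10, v11, ratio_c);  // = 4.0
  double res = lerp(c1, c2, ratio_r);   // = 2.5
  std::printf("interpolated value: %f\n", res);
  return 0;
}

The deleted dwcf/dwrf sections add one more lerp stage along f, and the dwrcf section composes all three axes (f1..f4, then fc1/fc2, then the final lerp).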
[Deleted by this hunk: the commented-out `gpk_reo_adaptive_launcher` body. It read nr/nc/nf from shape_h (forcing nr = 1 when D_LOCAL == 2), took max(n - 1, 1) threads per dimension, used (F, C, R) as the thread-block shape, sized dynamic shared memory as ((R+1) * (C+1) * (F+1) + R + C + F) * sizeof(T) plus small shape/dim metadata arrays, folded every dimension other than curr_dim_r/c/f into gridx, launched _gpk_reo on the given queue, and finished with gpuErrchk(cudaGetLastError()) and an optional cudaDeviceSynchronize(). The commented-out `gpk_reo` host dispatcher and its GPK(R, C, F) macro, which forwards the full argument list to the launcher, begin at the end of this span.]
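The launcher's arithmetic is easy to restate on the host. A sketch under stated assumptions: the shape values are made up, T is taken as double, and the small shape/dim metadata the original added to sm_size is omitted.

#include <algorithm>
#include <cstdio>

int main() {
  using SIZE = unsigned int;
  SIZE nr = 65, nc = 65, nf = 129;     // hypothetical input extents
  const SIZE R = 4, C = 4, F = 16;     // one of the GPK block shapes

  // One thread per interior point, at least one per dimension.
  SIZE tz = std::max(nr - 1, SIZE(1));
  SIZE ty = std::max(nc - 1, SIZE(1));
  SIZE tx = std::max(nf - 1, SIZE(1));

  // Grid = ceil(total / block) per dimension, as in the deleted launcher.
  SIZE gridz = (tz + R - 1) / R;
  SIZE gridy = (ty + C - 1) / C;
  SIZE gridx = (tx + F - 1) / F;

  // Padded (R+1) x (C+1) x (F+1) tile plus one ratio array per axis.
  size_t sm_size = ((R + 1) * (C + 1) * (F + 1) + R + C + F) * sizeof(double);

  std::printf("grid (%u %u %u), block (%u %u %u), smem %zu B\n",
              gridx, gridy, gridz, F, C, R, sm_size);
  return 0;
}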
[Deleted by this hunk: the rest of the commented-out `gpk_reo` dispatcher. With `profile` set from handle.profile_kernels, it selected a block shape per `config`: (2,2,128), (2,2,64), (4,4,32), (4,4,16), (4,4,8), (4,4,4), (2,2,2) for config 6..0 when D_LOCAL == 3, the same F values with (1,2,...) / (1,4,...) shapes when D_LOCAL == 2, and (1,1,...) shapes when D_LOCAL == 1, before #undef-ing GPK. The commented-out reverse kernel `_gpk_rev` begins next: its signature mirrors _gpk_reo with COEFF_RESTORE in place of CALC_COEFF plus svr/svc/svf and nvr/nvc/nvf extents, and its preamble computed the flat threadId and carved dynamic shared memory into the padded v_sm tile (ldsm1 = (F/2)*2 + 1, ldsm2 = (C/2)*2 + 1), the ratio_f/c/r arrays, and the shape metadata arrays.]
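The dispatch pattern is worth spelling out, since it is the reason for the GPK macro: every (R, C, F) shape must exist as a template instantiation at compile time, and profile mode instantiates them all. A self-contained sketch with a hypothetical stand-in launcher (the real one takes the full argument list repeated in the macro above):

#include <cstdio>

template <int R, int C, int F>
void launch(int n) { // stand-in for gpk_reo_adaptive_launcher
  std::printf("launching <%d, %d, %d> for n = %d\n", R, C, F, n);
}

void dispatch(int config, int n, bool profile) {
// Mirrors the deleted D_LOCAL == 3 table; the macro keeps the repeated
// argument list in one place while forcing each instantiation.
#define GPK(R, C, F) launch<R, C, F>(n);
  if (profile || config == 6) { GPK(2, 2, 128) }
  if (profile || config == 5) { GPK(2, 2, 64) }
  if (profile || config == 4) { GPK(4, 4, 32) }
  if (profile || config == 3) { GPK(4, 4, 16) }
  if (profile || config == 2) { GPK(4, 4, 8) }
  if (profile || config == 1) { GPK(4, 4, 4) }
  if (profile || config == 0) { GPK(2, 2, 2) }
#undef GPK
}

int main() {
  dispatch(/*config=*/4, /*n=*/1000, /*profile=*/false); // runs <4, 4, 32>
  return 0;
}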
[Deleted by this hunk: the commented-out `_gpk_rev` setup and load phase. It cached shape, shape_c, ldvs, ldws, and unprocessed_dims in shared memory; derived nr/nc/nf and nr_c/nc_c/nf_c (collapsing dimensions when D_LOCAL < 3 or < 2); decomposed blockIdx into the (r, c, f) tile origin plus the linearized index of the unprocessed dimensions, setting `in_next` from the parity of each unprocessed index; padded even extents to nr_p/nc_p/nf_p; computed the `skip` predicate from unprocessed_dims_sm; offset the dv and dw* pointers by the per-block offsets, aliasing dwf/dwcf/dwrf/dwrcf onto dw/dwc/dwr/dwrc when TYPE == 2; loaded the per-axis ratio arrays with the 0.5 fix-up at even-size boundaries; zeroed v_sm; and loaded dw into the tile over the interior partition, three faces, three edges, and the corner vertex. Each load had a TYPE == 1 path (writing 0 when !INTERPOLATION && COEFF_RESTORE) and a TYPE == 2 path that loads f_gl*2 and f_gl*2 + 1 and zeroes entries whose in_next && f_in_next predicate holds.]
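Both deleted kernels lean on the same index trick: a flat threadId is split into consecutive partitions of (R/2) * (C/2) * (F/2) threads (eight of them cover the interior planes), and each partition's local index is decomposed row-major into (r, c, f). A minimal sketch with illustrative constants:

#include <cstdio>

int main() {
  const int R2 = 2, C2 = 2, F2 = 8;        // (R/2), (C/2), (F/2) for one config
  const int part = 2;                      // partition index (e.g. the dwc store)
  int threadId = part * R2 * C2 * F2 + 13; // a flat id inside partition 2

  int local = threadId - part * R2 * C2 * F2;
  int r = local / (C2 * F2);               // slowest-varying: rows
  int c = (local % (C2 * F2)) / F2;        // then columns
  int f = (local % (C2 * F2)) % F2;        // fastest-varying: the f axis

  std::printf("threadId %d -> local (r c f) = (%d %d %d)\n", threadId, r, c, f);
  return 0;
}

The deleted code then scales these by 2 (with per-plane +1 offsets) to get the shared-memory coordinates, and adds the r/2, c/2, f/2 tile origin to get the global ones.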
* 2 + 1); i++) { -// // for (int j = 0; j < min(rest_c_p, (C/2) * 2 + 1); j++) { -// // for (int k = 0; k < min(rest_f_p, (F/2) * 2 + 1); k++) { -// // printf("%2.2f ", v_sm[get_idx(ldsm1, ldsm2, i, j, k)]); -// // } -// // printf("\n"); -// // } -// // printf("\n"); -// // } -// // } -// // __syncthreads(); - -// if (dwf && threadId >= (R/2) * (C/2) * (F/2) && threadId < (R/2) * (C/2) * -// (F/2) * 2) { - -// r_sm = ((threadId - (R/2) * (C/2) * (F/2)) / ((C/2) * (F/2))) * 2; -// c_sm = (((threadId - (R/2) * (C/2) * (F/2)) % ((C/2) * (F/2))) / (F/2)) * -// 2; f_sm = (((threadId - (R/2) * (C/2) * (F/2)) % ((C/2) * (F/2))) % -// (F/2)) * 2 + 1; r_gl = r / 2 + (threadId - (R/2) * (C/2) * (F/2)) / -// ((C/2) * (F/2)); c_gl = c / 2 + ((threadId - (R/2) * (C/2) * (F/2)) % -// ((C/2) * (F/2))) / (F/2); f_gl = f / 2 + ((threadId - (R/2) * (C/2) * -// (F/2)) % ((C/2) * (F/2))) % (F/2); - -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf - nf_c) { - -// res = dwf[get_idx(lddwf1, lddwf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { // fused -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm - 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm + 1)], -// ratio_f_sm[f_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm - 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm + 1)], -// ratio_f_sm[f_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf) { -// f_gl = 2 * f_gl + 1; -// // res = dwf[get_idx(lddwf1, lddwf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// ; -// } -// } -// // v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } - -// if (dwc && threadId >= (R/2) * (C/2) * (F/2) * 2 && threadId < (R/2) * -// (C/2) * (F/2) * 3) { -// r_sm = ((threadId - (R/2) * (C/2) * (F/2) * 2) / ((C/2) * (F/2))) * 2; -// c_sm = (((threadId - (R/2) * (C/2) * (F/2) * 2) % ((C/2) * (F/2))) / -// (F/2)) * 2 + 1; f_sm = (((threadId - (R/2) * (C/2) * (F/2) * 2) % ((C/2) -// * (F/2))) % (F/2)) * 2; r_gl = r / 2 + (threadId - (R/2) * (C/2) * (F/2) -// * 2) / ((C/2) * (F/2)); c_gl = c / 2 + ((threadId - (R/2) * (C/2) * (F/2) -// * 2) % ((C/2) * (F/2))) / (F/2); f_gl = f / 2 + ((threadId - (R/2) * -// (C/2) * (F/2) * 2) % ((C/2) * (F/2))) % (F/2); if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc - nc_c && f_gl < nf_c) { -// res = dwc[get_idx(lddwc1, lddwc2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm)], -// ratio_c_sm[c_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm)], -// ratio_c_sm[c_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc - nc_c && f_gl < nf) { -// res = dwc[get_idx(lddwc1, lddwc2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm)], 
-// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm)], -// ratio_c_sm[c_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } - -// if (dwr && threadId >= (R/2) * (C/2) * (F/2) * 3 && threadId < (R/2) * -// (C/2) * (F/2) * 4) { -// r_sm = ((threadId - (R/2) * (C/2) * (F/2) * 3) / ((C/2) * (F/2))) * 2 + -// 1; c_sm = (((threadId - (R/2) * (C/2) * (F/2) * 3) % ((C/2) * (F/2))) / -// (F/2)) * 2; f_sm = (((threadId - (R/2) * (C/2) * (F/2) * 3) % ((C/2) * -// (F/2))) % (F/2)) * 2; r_gl = r / 2 + (threadId - (R/2) * (C/2) * (F/2) * -// 3) / ((C/2) * (F/2)); c_gl = c / 2 + ((threadId - (R/2) * (C/2) * (F/2) * -// 3) % ((C/2) * (F/2))) / (F/2); f_gl = f / 2 + ((threadId - (R/2) * (C/2) -// * (F/2) * 3) % ((C/2) * (F/2))) % (F/2); - -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc_c && f_gl < nf_c) { -// res = dwr[get_idx(lddwr1, lddwr2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm)], -// ratio_r_sm[r_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm)], -// ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc_c && f_gl < nf) { -// res = dwr[get_idx(lddwr1, lddwr2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm)], -// ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } - -// if (dwcf && threadId >= (R/2) * (C/2) * (F/2) * 4 && threadId < (R/2) * -// (C/2) * (F/2) * 5) { -// r_sm = ((threadId - (R/2) * (C/2) * (F/2) * 4) / ((C/2) * (F/2))) * 2; -// c_sm = (((threadId - (R/2) * (C/2) * (F/2) * 4) % ((C/2) * (F/2))) / -// (F/2)) * 2 + 1; f_sm = (((threadId - (R/2) * (C/2) * (F/2) * 4) % ((C/2) -// * (F/2))) % (F/2)) * 2 + 1; r_gl = r / 2 + (threadId - (R/2) * (C/2) * -// (F/2) * 4) / ((C/2) * (F/2)); c_gl = c / 2 + ((threadId - (R/2) * (C/2) * -// (F/2) * 4) % ((C/2) * (F/2))) / (F/2); f_gl = f / 2 + ((threadId - (R/2) -// * (C/2) * (F/2) * 4) % ((C/2) * (F/2))) % (F/2); - -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc - nc_c && f_gl < nf - nf_c) { -// res = dwcf[get_idx(lddwcf1, lddwcf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// T f1 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f2 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// res += lerp(f1, f2, ratio_c_sm[c_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// T f1 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f2 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm + -// 1)], 
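
Every coefficient class touched by the deleted kernel above (dwf, dwc, dwr, and so on) follows the same update rule: read the stored coefficient, then either add to it or replace it with the linear interpolation of the two neighboring nodal values along the axis being processed. A minimal host-side sketch of that rule, assuming the usual lerp convention (1 - r) * a + r * b; the helper names here are hypothetical, not MGARD API:

template <typename T>
T lerp(T a, T b, T ratio) {
  // Linear interpolation between neighboring nodal values a and b.
  return a + ratio * (b - a);
}

// Update rule applied to each coefficient in the blocks above:
//  - INTERPOLATION && COEFF_RESTORE: fused restore, coefficient plus predictor
//  - INTERPOLATION only:             recompute the predictor, drop coefficient
//  - neither:                        pass the stored coefficient through
template <typename T>
T restore_coefficient(T coeff, T left, T right, T ratio,
                      bool interpolation, bool coeff_restore) {
  if (interpolation && coeff_restore)
    return coeff + lerp(left, right, ratio);
  if (interpolation)
    return lerp(left, right, ratio);
  return coeff;
}
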
ratio_f_sm[f_sm - 1]); -// res = lerp(f1, f2, ratio_c_sm[c_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl = 2 * f_gl + 1; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc - nc_c && f_gl < nf) { -// res = dwcf[get_idx(lddwrf1, lddwrf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm)], -// ratio_c_sm[c_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } - -// if (dwrf && threadId >= (R/2) * (C/2) * (F/2) * 5 && threadId < (R/2) * -// (C/2) * (F/2) * 6) { -// r_sm = ((threadId - (R/2) * (C/2) * (F/2) * 5) / ((C/2) * (F/2))) * 2 + -// 1; c_sm = (((threadId - (R/2) * (C/2) * (F/2) * 5) % ((C/2) * (F/2))) / -// (F/2)) * 2; f_sm = (((threadId - (R/2) * (C/2) * (F/2) * 5) % ((C/2) * -// (F/2))) % (F/2)) * 2 + 1; r_gl = r / 2 + (threadId - (R/2) * (C/2) * -// (F/2) * 5) / ((C/2) * (F/2)); c_gl = c / 2 + ((threadId - (R/2) * (C/2) * -// (F/2) * 5) % ((C/2) * (F/2))) / (F/2); f_gl = f / 2 + ((threadId - (R/2) -// * (C/2) * (F/2) * 5) % ((C/2) * (F/2))) % (F/2); - -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc_c && f_gl < nf - nf_c) { - -// res = dwrf[get_idx(lddwrf1, lddwrf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// T f1 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f2 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); - -// res += lerp(f1, f2, ratio_r_sm[r_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// T f1 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f2 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); - -// res = lerp(f1, f2, ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl = 2 * f_gl + 1; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc_c && f_gl < nf) { -// res = dwrf[get_idx(lddwrf1, lddwrf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm)], -// ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } - -// if (dwrc && threadId >= (R/2) * (C/2) * (F/2) * 6 && threadId < (R/2) * -// (C/2) * (F/2) * 7) { -// r_sm = ((threadId - (R/2) * (C/2) * (F/2) * 6) / ((C/2) * (F/2))) * 2 + -// 1; c_sm = (((threadId - (R/2) * (C/2) * (F/2) * 6) % ((C/2) * (F/2))) / -// (F/2)) * 2 + 1; f_sm = (((threadId - (R/2) * (C/2) * (F/2) * 6) % ((C/2) -// * (F/2))) % (F/2)) * 2; r_gl = r / 2 + (threadId - (R/2) * (C/2) * (F/2) -// * 6) / ((C/2) * (F/2)); c_gl = c / 2 + ((threadId - (R/2) * (C/2) * (F/2) -// * 6) % ((C/2) * (F/2))) / (F/2); f_gl = f / 2 + ((threadId - (R/2) * -// (C/2) * (F/2) * 6) % ((C/2) * (F/2))) % (F/2); - -// if (TYPE == 1) { -// if (r_sm < 
rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc - nc_c && f_gl < nf_c) { -// res = dwrc[get_idx(lddwrc1, lddwrc2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// T c1 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// T c2 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// res += lerp(c1, c2, ratio_r_sm[r_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// T c1 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// T c2 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// res = lerp(c1, c2, ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc - nc_c && f_gl < nf) { -// res = dwrc[get_idx(lddwrc1, lddwrc2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// T c1 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// T c2 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// res = lerp(c1, c2, ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } - -// if (dwrcf && threadId >= (R/2) * (C/2) * (F/2) * 7 && threadId < (R/2) * -// (C/2) * (F/2) * 8) { -// r_sm = ((threadId - (R/2) * (C/2) * (F/2) * 7) / ((C/2) * (F/2))) * 2 + -// 1; c_sm = (((threadId - (R/2) * (C/2) * (F/2) * 7) % ((C/2) * (F/2))) / -// (F/2)) * 2 + 1; f_sm = (((threadId - (R/2) * (C/2) * (F/2) * 7) % ((C/2) -// * (F/2))) % (F/2)) * 2 + 1; r_gl = r / 2 + (threadId - (R/2) * (C/2) * -// (F/2) * 7) / ((C/2) * (F/2)); c_gl = c / 2 + ((threadId - (R/2) * (C/2) * -// (F/2) * 7) % ((C/2) * (F/2))) / (F/2); f_gl = f / 2 + ((threadId - (R/2) -// * (C/2) * (F/2) * 7) % ((C/2) * (F/2))) % (F/2); - -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc - nc_c && f_gl < nf - nf_c) { -// res = dwrcf[get_idx(lddwrcf1, lddwrcf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// T f1 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm - 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm - 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f2 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm + 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm + 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f3 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm - 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm - 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f4 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm + 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm + 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); - -// T fc1 = lerp(f1, f2, ratio_c_sm[c_sm - 1]); -// T fc2 = lerp(f3, f4, ratio_c_sm[c_sm - 1]); - -// res += lerp(fc1, fc2, ratio_r_sm[r_sm - 1]); -// } 
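
The edge (dwcf, dwrf, dwrc) and corner (dwrcf) coefficient blocks compose that 1D step along two or three axes: four f-direction lerps are reduced along c, then along r. A sketch of the composition, with the same hypothetical lerp helper as in the previous sketch:

template <typename T>
T lerp(T a, T b, T ratio) { return a + ratio * (b - a); }

// Trilinear prediction from the 8 surrounding nodal values, composed of 1D
// lerps in the same order as the dwrcf block above: f first, then c, then r.
// v[i][j][k] indexes the (r, c, f) neighbors; ratios are the per-axis weights.
template <typename T>
T trilerp(const T v[2][2][2], T ratio_r, T ratio_c, T ratio_f) {
  T f1 = lerp(v[0][0][0], v[0][0][1], ratio_f);
  T f2 = lerp(v[0][1][0], v[0][1][1], ratio_f);
  T f3 = lerp(v[1][0][0], v[1][0][1], ratio_f);
  T f4 = lerp(v[1][1][0], v[1][1][1], ratio_f);
  T fc1 = lerp(f1, f2, ratio_c);   // reduce along c
  T fc2 = lerp(f3, f4, ratio_c);
  return lerp(fc1, fc2, ratio_r);  // reduce along r
}
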
else if (INTERPOLATION && !COEFF_RESTORE) { -// T f1 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm - 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm - 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f2 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm + 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm + 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f3 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm - 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm - 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f4 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm + 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm + 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); - -// T fc1 = lerp(f1, f2, ratio_c_sm[c_sm - 1]); -// T fc2 = lerp(f3, f4, ratio_c_sm[c_sm - 1]); - -// res = lerp(fc1, fc2, ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl = 2 * f_gl + 1; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc - nc_c && f_gl < nf) { -// res = dwrcf[get_idx(lddwrcf1, lddwrcf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// T c1 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// T c2 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// res = lerp(c1, c2, ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } - -// if (r + (R/2) * 2 == nr_p - 1) { -// if (threadId < (C/2) * (F/2)) { -// if (dwf) { -// r_sm = (R/2) * 2; -// c_sm = (threadId / (F/2)) * 2; -// f_sm = (threadId % (F/2)) * 2 + 1; -// r_gl = r / 2 + (R/2); -// c_gl = c / 2 + threadId / (F/2); -// f_gl = f / 2 + threadId % (F/2); -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf - nf_c) { -// res = dwf[get_idx(lddwf1, lddwf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm - 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm + 1)], -// ratio_f_sm[f_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl = 2 * f_gl + 1; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf) { -// // res = dwf[get_idx(lddwf1, lddwf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// ; -// } -// } -// // v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } - -// if (dwc) { -// r_sm = (R/2) * 2; -// c_sm = (threadId / (F/2)) * 2 + 1; -// f_sm = (threadId % (F/2)) * 2; -// r_gl = r / 2 + (R/2); -// c_gl = c / 2 + threadId / (F/2); -// f_gl = f / 2 + threadId % (F/2); - -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc - nc_c && f_gl < nf_c) { -// res = dwc[get_idx(lddwc1, lddwc2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm 
- 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm)], -// ratio_c_sm[c_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc - nc_c && f_gl < nf) { -// res = dwc[get_idx(lddwc1, lddwc2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm)], -// ratio_c_sm[c_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } -// if (dwcf) { -// r_sm = (R/2) * 2; -// c_sm = (threadId / (F/2)) * 2 + 1; -// f_sm = (threadId % (F/2)) * 2 + 1; -// r_gl = r / 2 + (R/2); -// c_gl = c / 2 + threadId / (F/2); -// f_gl = f / 2 + threadId % (F/2); -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc - nc_c && f_gl < nf - nf_c) { -// res = dwcf[get_idx(lddwcf1, lddwcf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// T f1 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f2 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// res += lerp(f1, f2, ratio_c_sm[c_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// T f1 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f2 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// res = lerp(f1, f2, ratio_c_sm[c_sm - 1]); -// } -// } - -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl = 2 * f_gl + 1; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc - nc_c && f_gl < nf) { -// res = dwcf[get_idx(lddwcf1, lddwcf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm)], -// ratio_c_sm[c_sm - 1]); -// // if (idx[1] ==0 && idx[2] == 0) { -// // printf("%f(%d %d %d) %f(%d %d %d) -> %f\n", -// // v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, -// f_sm)], -// // r_sm, c_sm - 1, f_sm, v_sm[get_idx(ldsm1, ldsm2, -// // r_sm, c_sm + 1, f_sm)], r_sm, c_sm + 1, f_sm, -// res); -// // } -// } -// } - -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } -// } -// } - -// if (c + (C/2) * 2 == nc_p - 1) { -// if (threadId >= (R/2) * (C/2) * (F/2) && threadId < (R/2) * (C/2) * (F/2) -// + (R/2) * (F/2)) { -// if (dwf) { -// r_sm = ((threadId - (R/2) * (C/2) * (F/2)) / (F/2)) * 2; -// c_sm = (C/2) * 2; -// f_sm = ((threadId - (R/2) * (C/2) * (F/2)) % (F/2)) * 2 + 1; -// r_gl = r / 2 + (threadId - (R/2) * (C/2) * (F/2)) / (F/2); -// c_gl = c / 2 + (C/2); -// f_gl = f / 2 + (threadId - (R/2) * (C/2) * (F/2)) % (F/2); - -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && 
f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf - nf_c) { -// res = dwf[get_idx(lddwf1, lddwf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm - 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm + 1)], -// ratio_f_sm[f_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl = 2 * f_gl + 1; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf) { -// // res = dwf[get_idx(lddwf1, lddwf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// ; -// } -// } -// // v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } -// if (dwr) { -// r_sm = ((threadId - (R/2) * (C/2) * (F/2)) / (F/2)) * 2 + 1; -// c_sm = (C/2) * 2; -// f_sm = ((threadId - (R/2) * (C/2) * (F/2)) % (F/2)) * 2; -// r_gl = r / 2 + (threadId - (R/2) * (C/2) * (F/2)) / (F/2); -// c_gl = c / 2 + (C/2); -// f_gl = f / 2 + (threadId - (R/2) * (C/2) * (F/2)) % (F/2); -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc_c && f_gl < nf_c) { -// res = dwr[get_idx(lddwr1, lddwr2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, -// f_sm)], ratio_r_sm[r_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm)], -// ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc_c && f_gl < nf) { -// res = dwr[get_idx(lddwr1, lddwr2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm)], -// ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } -// if (dwrf) { -// r_sm = ((threadId - (R/2) * (C/2) * (F/2)) / (F/2)) * 2 + 1; -// c_sm = (C/2) * 2; -// f_sm = ((threadId - (R/2) * (C/2) * (F/2)) % (F/2)) * 2 + 1; -// r_gl = r / 2 + (threadId - (R/2) * (C/2) * (F/2)) / (F/2); -// c_gl = c / 2 + (C/2); -// f_gl = f / 2 + (threadId - (R/2) * (C/2) * (F/2)) % (F/2); - -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc_c && f_gl < nf - nf_c) { -// res = dwrf[get_idx(lddwrf1, lddwrf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// T f1 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f2 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// res += lerp(f1, f2, ratio_r_sm[r_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// T f1 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm 
- -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f2 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// res = lerp(f1, f2, ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl = 2 * f_gl + 1; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc_c && f_gl < nf) { -// res = dwrf[get_idx(lddwrf1, lddwrf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm)], -// ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } -// } -// } - -// if (f + (F/2) * 2 == nf_p - 1) { -// if (threadId >= (R/2) * (C/2) * (F/2) * 2 && threadId < (R/2) * (C/2) * -// (F/2) * 2 + (R/2) * (C/2)) { -// if (dwc) { -// r_sm = ((threadId - (R/2) * (C/2) * (F/2) * 2) / (C/2)) * 2; -// c_sm = ((threadId - (R/2) * (C/2) * (F/2) * 2) % (C/2)) * 2 + 1; -// f_sm = (F/2) * 2; -// r_gl = r / 2 + (threadId - (R/2) * (C/2) * (F/2) * 2) / (C/2); -// c_gl = c / 2 + (threadId - (R/2) * (C/2) * (F/2) * 2) % (C/2); -// f_gl = f / 2 + (F/2); -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc - nc_c && f_gl < nf_c) { -// res = dwc[get_idx(lddwc1, lddwc2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm)], -// ratio_c_sm[c_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc - nc_c && f_gl < nf) { -// res = dwc[get_idx(lddwc1, lddwc2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm)], -// ratio_c_sm[c_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } - -// if (dwr) { -// r_sm = ((threadId - (R/2) * (C/2) * (F/2) * 2) / (C/2)) * 2 + 1; -// c_sm = ((threadId - (R/2) * (C/2) * (F/2) * 2) % (C/2)) * 2; -// f_sm = (F/2) * 2; -// r_gl = r / 2 + (threadId - (R/2) * (C/2) * (F/2) * 2) / (C/2); -// c_gl = c / 2 + (threadId - (R/2) * (C/2) * (F/2) * 2) % (C/2); -// f_gl = f / 2 + (F/2); -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc_c && f_gl < nf_c) { -// res = dwr[get_idx(lddwr1, lddwr2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, -// f_sm)], ratio_r_sm[r_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm)], -// ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, 
c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc_c && f_gl < nf) { -// res = dwr[get_idx(lddwr1, lddwr2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm)], -// ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } - -// if (dwrc) { -// r_sm = ((threadId - (R/2) * (C/2) * (F/2) * 2) / (C/2)) * 2 + 1; -// c_sm = ((threadId - (R/2) * (C/2) * (F/2) * 2) % (C/2)) * 2 + 1; -// f_sm = (F/2) * 2; -// r_gl = r / 2 + (threadId - (R/2) * (C/2) * (F/2) * 2) / (C/2); -// c_gl = c / 2 + (threadId - (R/2) * (C/2) * (F/2) * 2) % (C/2); -// f_gl = f / 2 + (F/2); - -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc - nc_c && f_gl < nf_c) { -// res = dwrc[get_idx(lddwrc1, lddwrc2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// T c1 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// T c2 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// res += lerp(c1, c2, ratio_r_sm[r_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// T c1 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// T c2 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// res = lerp(c1, c2, ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc - nc_c && f_gl < nf) { -// res = dwrc[get_idx(lddwrc1, lddwrc2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// T c1 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// T c2 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// res = lerp(c1, c2, ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } -// } -// } - -// if (c + (C/2) * 2 == nc_p - 1 && f + (F/2) * 2 == nf_p - 1) { -// if (threadId >= (R/2) * (C/2) * (F/2) * 3 && threadId < (R/2) * (C/2) * -// (F/2) * 3 + (R/2)) { -// if (dwr) { -// r_sm = (threadId - (R/2) * (C/2) * (F/2) * 3) * 2 + 1; -// c_sm = (C/2) * 2; -// f_sm = (F/2) * 2; -// r_gl = r / 2 + threadId - (R/2) * (C/2) * (F/2) * 3; -// c_gl = c / 2 + (C/2); -// f_gl = f / 2 + (F/2); -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc_c && f_gl < nf_c) { -// res = dwr[get_idx(lddwr1, lddwr2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, -// f_sm)], ratio_r_sm[r_sm - 1]); 
-// } else if (INTERPOLATION && !COEFF_RESTORE) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm)], -// ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc_c && f_gl < nf) { -// res = dwr[get_idx(lddwr1, lddwr2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm)], -// ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } -// } -// } - -// if (r + (R/2) * 2 == nr_p - 1 && f + (F/2) * 2 == nf_p - 1) { -// if (threadId >= (R/2) * (C/2) * (F/2) * 4 && threadId < (R/2) * (C/2) * -// (F/2) * 4 + (C/2)) { -// if (dwc) { -// r_sm = (R/2) * 2; -// c_sm = (threadId - (R/2) * (C/2) * (F/2) * 4) * 2 + 1; -// f_sm = (F/2) * 2; -// r_gl = r / 2 + (R/2); -// c_gl = c / 2 + threadId - (R/2) * (C/2) * (F/2) * 4; -// f_gl = f / 2 + (F/2); -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc - nc_c && f_gl < nf_c) { -// res = dwc[get_idx(lddwc1, lddwc2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm)], -// ratio_c_sm[c_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc - nc_c && f_gl < nf) { -// res = dwc[get_idx(lddwc1, lddwc2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm)], -// ratio_c_sm[c_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } -// } -// } - -// if (r + (R/2) * 2 == nr_p - 1 && c + (C/2) * 2 == nc_p - 1) { -// if (threadId >= (R/2) * (C/2) * (F/2) * 5 && threadId < (R/2) * (C/2) * -// (F/2) * 5 + (F/2)) { -// if (dwf) { -// r_sm = (R/2) * 2; -// c_sm = (C/2) * 2; -// f_sm = (threadId - (R/2) * (C/2) * (F/2) * 5) * 2 + 1; -// r_gl = r / 2 + (R/2); -// c_gl = c / 2 + (C/2); -// f_gl = f / 2 + threadId - (R/2) * (C/2) * (F/2) * 5; -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf - nf_c) { -// res = dwf[get_idx(lddwf1, lddwf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm - 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm + 1)], -// ratio_f_sm[f_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl = 2 * f_gl + 1; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < 
rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf) { -// // res = dwf[get_idx(lddwf1, lddwf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// ; -// } -// } -// // v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } -// } -// } - -// // __syncthreads(); -// // if (debug) { -// // printf("TYPE: %d %d %d %d\n", TYPE, min(rest_r_p, (R/2) * 2 + 1), -// // min(rest_c_p, (C/2) * 2 + 1), min(rest_f_p, (F/2) * 2 + 1)); -// // for (int i = 0; i < min(rest_r_p, (R/2) * 2 + 1); i++) { -// // for (int j = 0; j < min(rest_c_p, (C/2) * 2 + 1); j++) { -// // for (int k = 0; k < min(rest_f_p, (F/2) * 2 + 1); k++) { -// // printf("%2.2f ", v_sm[get_idx(ldsm1, ldsm2, i, j, k)]); -// // } -// // printf("\n"); -// // } -// // printf("\n"); -// // } -// // } -// // __syncthreads(); - -// __syncthreads(); - -// r_sm = threadIdx.z; -// c_sm = threadIdx.y; -// f_sm = threadIdx.x; - -// r_sm_ex = FunctorBase::GetBlockDimZ(); -// c_sm_ex = FunctorBase::GetBlockDimY(); -// f_sm_ex = FunctorBase::GetBlockDimX(); - -// r_gl = r + r_sm; -// c_gl = c + c_sm; -// f_gl = f + f_sm; - -// // r_gl_ex = r + (R/2) * 2; -// // c_gl_ex = c + (C/2) * 2; -// // f_gl_ex = f + (F/2) * 2; - -// r_gl_ex = r + rest_r - 1; -// c_gl_ex = c + rest_c - 1; -// f_gl_ex = f + rest_f - 1; - -// int unpadding_r = rest_r; -// int unpadding_c = rest_c; -// int unpadding_f = rest_f; -// if (nr % 2 == 0) -// unpadding_r -= 1; -// if (nc % 2 == 0) -// unpadding_c -= 1; -// if (TYPE == 1 && nf % 2 == 0) -// unpadding_f -= 1; - -// if (r_sm < unpadding_r && c_sm < unpadding_c && f_sm < unpadding_f) { - -// // store extra rules -// // case 1: input = odd (non-padding required) -// // case 1.a: block size + 1 == rest (need to store extra); -// // case 1.b: block size + 1 != rest (No need to store extra); -// // case 2: input = even (un-padding requried) -// // case 2.a: block size + 1 >= rest (No need to store extra, but need -// // un-padding first); case 2.b: block size + 1 < rest (No need to -// store -// // extra); - -// if (D_LOCAL >= 3 && r_sm == 0) { -// if (nr % 2 != 0 && (R/2) * 2 + 1 == rest_r) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, f_sm)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, f_sm)]; -// } -// } -// if (nr % 2 == 0 && (R/2) * 2 + 1 >= rest_r_p) { -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, c_sm, f_sm)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm, f_sm)]; -// } -// } - -// if (D_LOCAL >= 2 && c_sm == 0) { -// if (nc % 2 != 0 && (C/2) * 2 + 1 == rest_c) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, f_sm)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, f_sm)]; -// } -// } -// if (nc % 2 == 0 && (C/2) * 2 + 1 >= rest_c_p) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c - 1, f_sm)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c_p - 1, f_sm)]; -// } -// } - -// if (D_LOCAL >= 1 && f_sm == 0) { -// if (nf % 2 != 0 && (F/2) * 2 + 1 == rest_f) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm_ex)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm_ex)]; -// } -// } -// if (nf % 2 == 0 && (F/2) * 2 + 1 >= 
rest_f_p && TYPE == 1) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, rest_f - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, rest_f_p - 1)]; -// } -// } - -// // load extra edges -// if (D_LOCAL >= 2 && c_sm == 0 && f_sm == 0) { -// if (nc % 2 != 0 && (C/2) * 2 + 1 == rest_c && nf % 2 != 0 && -// (F/2) * 2 + 1 == rest_f) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, f_sm_ex)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, f_sm_ex)]; -// } -// } -// if (nc % 2 == 0 && nf % 2 == 0 && (C/2) * 2 + 1 >= rest_c_p && -// (F/2) * 2 + 1 >= rest_f_p && TYPE == 1) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c - 1, rest_f - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c_p - 1, rest_f_p - 1)]; -// } -// if (nc % 2 == 0 && nf % 2 != 0 && (C/2) * 2 + 1 >= rest_c_p && -// (F/2) * 2 + 1 == rest_f) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c_p - 1, f_sm_ex)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c_p - 1, f_sm_ex)]; -// } -// } -// if (nc % 2 != 0 && nf % 2 == 0 && (C/2) * 2 + 1 == rest_c && -// (F/2) * 2 + 1 >= rest_f_p && TYPE == 1) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, rest_f_p - 1)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, rest_f_p - 1)]; -// // printf("(%d %d %d): %f <- (%d %d %d)\n", -// // r_gl, c_gl_ex, f_gl_ex, -// // dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)], -// // r_sm, c_sm_ex, f_gl_ex); -// } -// } -// } - -// if (D_LOCAL >= 3 && r_sm == 0 && f_sm == 0) { -// if (nr % 2 != 0 && (R/2) * 2 + 1 == rest_r && nf % 2 != 0 && -// (F/2) * 2 + 1 == rest_f) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, f_sm_ex)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, f_sm_ex)]; -// } -// } -// if (nr % 2 == 0 && nf % 2 == 0 && (R/2) * 2 + 1 >= rest_r_p && -// (F/2) * 2 + 1 >= rest_f_p && TYPE == 1) { -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, c_sm, rest_f - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm, rest_f_p - 1)]; -// } -// if (nr % 2 == 0 && nf % 2 != 0 && (R/2) * 2 + 1 >= rest_r_p && -// (F/2) * 2 + 1 == rest_f) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm, f_sm_ex)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm, f_sm_ex)]; -// } -// } -// if (nr % 2 != 0 && nf % 2 == 0 && (R/2) * 2 + 1 == rest_r && -// (F/2) * 2 + 1 >= rest_f_p && TYPE == 1) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, rest_f_p - 1)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, rest_f_p - 1)]; -// // printf("(%d %d %d): %f <- (%d %d %d)\n", -// // r_gl_ex, c_gl, rest_f-1, -// // dv[get_idx(lddv1, lddv2, r_gl_ex-1, c_gl, f_gl_ex)], -// // r_sm_ex, c_sm, rest_f_p-1); -// } -// } 
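
The store rules spelled out in the comments above reduce to a parity check per dimension: an odd-sized dimension keeps its boundary plane as-is, while an even-sized dimension was padded by one plane during processing and that padded plane has to be folded back before the block is written out. A small sketch of the bookkeeping (helper names hypothetical):

#include <cstddef>

// Even dimensions are processed with one plane of padding (n_p = n + 1);
// odd dimensions need none. Mirrors the nr_p/nc_p/nf_p setup in the kernel.
inline std::size_t padded_size(std::size_t n) {
  return (n % 2 == 0) ? n + 1 : n;
}

// Un-padding on the way out: copy the value parked at the padded index back
// to the true last index, as the rest_*_p -> rest_* copies above do.
template <typename T>
void unpad_last(T *line, std::size_t n) {
  std::size_t n_p = padded_size(n);
  if (n_p != n)
    line[n - 1] = line[n_p - 1];
}
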
-// } - -// if (D_LOCAL >= 3 && r_sm == 0 && c_sm == 0) { -// if (nr % 2 != 0 && (R/2) * 2 + 1 == rest_r && nc % 2 != 0 && -// (C/2) * 2 + 1 == rest_c) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, f_sm)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, f_sm)]; -// } -// } -// if (nr % 2 == 0 && nc % 2 == 0 && (R/2) * 2 + 1 >= rest_r_p && -// (C/2) * 2 + 1 >= rest_c_p) { -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, rest_c - 1, f_sm)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, rest_c_p - 1, f_sm)]; -// } -// if (nr % 2 == 0 && nc % 2 != 0 && (R/2) * 2 + 1 >= rest_r_p && -// (C/2) * 2 + 1 == rest_c) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl)] += -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm_ex, f_sm)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm_ex, f_sm)]; -// } -// } -// if (nr % 2 != 0 && nc % 2 == 0 && (R/2) * 2 + 1 == rest_r && -// (C/2) * 2 + 1 >= rest_c_p) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c_p - 1, f_sm)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c_p - 1, f_sm)]; -// } -// } -// } -// // load extra vertex - -// if (D_LOCAL >= 3 && r_sm == 0 && c_sm == 0 && f_sm == 0) { -// if (nr % 2 != 0 && (R/2) * 2 + 1 == rest_r && nc % 2 != 0 && -// (C/2) * 2 + 1 == rest_c && nf % 2 != 0 && (F/2) * 2 + 1 == rest_f) -// { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, f_sm_ex)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, f_sm_ex)]; -// } -// } - -// if (nr % 2 == 0 && nc % 2 == 0 && nf % 2 == 0 && (R/2) * 2 + 1 >= -// rest_r_p && -// (C/2) * 2 + 1 >= rest_c_p && (F/2) * 2 + 1 >= rest_f_p && TYPE == -// 1) { -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, rest_c - 1, rest_f - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, rest_c_p - 1, -// rest_f_p - 1)]; -// } -// if (nr % 2 == 0 && nc % 2 == 0 && nf % 2 != 0 && (R/2) * 2 + 1 >= -// rest_r_p && -// (C/2) * 2 + 1 >= rest_c_p && (F/2) * 2 + 1 == rest_f) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, rest_c_p - 1, -// f_sm_ex)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, rest_c_p - 1, -// f_sm_ex)]; -// } -// } -// if (nr % 2 == 0 && nc % 2 != 0 && nf % 2 == 0 && (R/2) * 2 + 1 >= -// rest_r_p && -// (C/2) * 2 + 1 == rest_c && (F/2) * 2 + 1 >= rest_f_p && TYPE == 1) -// { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm_ex, rest_f_p - -// 1)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm_ex, rest_f_p - -// 1)]; -// } -// } -// if (nr % 2 != 0 && nc % 2 == 0 && nf % 2 == 0 && (R/2) * 2 + 1 == -// rest_r && -// (C/2) * 2 + 1 >= rest_c_p && (F/2) * 2 + 1 >= rest_f_p && TYPE == -// 1) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// 
dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c_p - 1, rest_f_p - -// 1)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c_p - 1, rest_f_p - -// 1)]; -// } -// } -// if (nr % 2 == 0 && nc % 2 != 0 && nf % 2 != 0 && (R/2) * 2 + 1 >= -// rest_r_p && -// (C/2) * 2 + 1 == rest_c && (F/2) * 2 + 1 == rest_f) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm_ex, f_sm_ex)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm_ex, f_sm_ex)]; -// } -// } -// if (nr % 2 != 0 && nc % 2 == 0 && nf % 2 != 0 && (R/2) * 2 + 1 == -// rest_r && -// (C/2) * 2 + 1 >= rest_c_p && (F/2) * 2 + 1 == rest_f) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c_p - 1, f_sm_ex)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c_p - 1, f_sm_ex)]; -// } -// } -// if (nr % 2 != 0 && nc % 2 != 0 && nf % 2 == 0 && (R/2) * 2 + 1 == -// rest_r && -// (C/2) * 2 + 1 == rest_c && (F/2) * 2 + 1 >= rest_f_p && TYPE == 1) -// { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, rest_f_p - 1)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, rest_f_p - 1)]; -// } -// } -// } -// } - -// __syncthreads(); - -// if (r_sm < rest_r && c_sm < rest_c && f_sm < rest_f) { -// if (r_gl >= svr && r_gl < svr + nvr && c_gl >= svc && c_gl < svc + nvc && -// f_gl >= svf && f_gl < svf + nvf) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl, f_gl)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl, f_gl)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)]; -// } -// } -// } -// } - -// template -// void gpk_rev_adaptive_launcher( -// Handle &handle, SIZE *shape_h, SIZE *shape_d, SIZE -// *shape_c_d, SIZE *ldvs, SIZE *ldws, DIM unprocessed_n, DIM -// *unprocessed_dims, DIM curr_dim_r, DIM curr_dim_c, DIM curr_dim_f, T -// *dratio_r, T *dratio_c, T *dratio_f, T *dv, LENGTH lddv1, LENGTH lddv2, T -// *dw, LENGTH lddw1, LENGTH lddw2, T *dwf, LENGTH lddwf1, LENGTH lddwf2, T -// *dwc, LENGTH lddwc1, LENGTH lddwc2, T *dwr, LENGTH lddwr1, LENGTH lddwr2, -// T *dwcf, LENGTH lddwcf1, LENGTH lddwcf2, T *dwrf, LENGTH lddwrf1, LENGTH -// lddwrf2, T *dwrc, LENGTH lddwrc1, LENGTH lddwrc2, T *dwrcf, LENGTH -// lddwrcf1, LENGTH lddwrcf2, SIZE svr, SIZE svc, SIZE svf, SIZE nvr, SIZE -// nvc, SIZE nvf, int queue_idx) { - -// SIZE nr = shape_h[curr_dim_r]; -// SIZE nc = shape_h[curr_dim_c]; -// SIZE nf = shape_h[curr_dim_f]; -// if (D_LOCAL == 2) { -// nr = 1; -// } -// SIZE total_thread_z = std::max(nr - 1, (SIZE)1); -// SIZE total_thread_y = std::max(nc - 1, (SIZE)1); -// SIZE total_thread_x = std::max(nf - 1, (SIZE)1); - -// SIZE tbx, tby, tbz, gridx, gridy, gridz; -// dim3 threadsPerBlock, blockPerGrid; -// size_t sm_size; - -// // tbz = std::min(R, total_thread_z); -// // tby = std::min(C, total_thread_y); -// // tbx = std::min(F, total_thread_x); -// tbz = R; -// tby = C; -// tbx = F; -// sm_size = ((R + 1) * (C + 1) * (F + 1) + R + C + 
F) * sizeof(T); -// sm_size += (D_GLOBAL * 4) * sizeof(SIZE); -// sm_size += (D_GLOBAL * 1) * sizeof(DIM); - -// gridz = ceil((float)total_thread_z / tbz); -// gridy = ceil((float)total_thread_y / tby); -// gridx = ceil((float)total_thread_x / tbx); -// for (DIM d = 0; d < D_GLOBAL; d++) { -// if (D_LOCAL == 3 && d != curr_dim_f && d != curr_dim_c && d != -// curr_dim_r) { -// gridx *= shape_h[d]; -// } -// if (D_LOCAL == 2 && d != curr_dim_f && d != curr_dim_c) { -// gridx *= shape_h[d]; -// } -// } - -// threadsPerBlock = dim3(tbx, tby, tbz); -// blockPerGrid = dim3(gridx, gridy, gridz); - -// // printf("gpk_rev exec: %d %d %d %d %d %d\n", tbx, tby, tbz, gridx, gridy, -// // gridz); -// _gpk_rev<<>>( -// shape_d, shape_c_d, ldvs, ldws, unprocessed_n, unprocessed_dims, -// curr_dim_r, curr_dim_c, curr_dim_f, dratio_r, dratio_c, dratio_f, dv, -// lddv1, lddv2, dw, lddw1, lddw2, dwf, lddwf1, lddwf2, dwc, lddwc1, -// lddwc2, dwr, lddwr1, lddwr2, dwcf, lddwcf1, lddwcf2, dwrf, lddwrf1, -// lddwrf2, dwrc, lddwrc1, lddwrc2, dwrcf, lddwrcf1, lddwrcf2, svr, svc, -// svf, nvr, nvc, nvf); -// gpuErrchk(cudaGetLastError()); -// if (handle.sync_and_check_all_kernels) { -// gpuErrchk(cudaDeviceSynchronize()); -// } -// } - -// template -// void gpk_rev(Handle &handle, SIZE *shape_h, SIZE *shape_d, -// SIZE *shape_c_d, SIZE *ldvs, SIZE *ldws, DIM unprocessed_n, -// DIM *unprocessed_dims, DIM curr_dim_r, DIM curr_dim_c, -// DIM curr_dim_f, T *dratio_r, T *dratio_c, T *dratio_f, T *dv, -// LENGTH lddv1, LENGTH lddv2, T *dw, LENGTH lddw1, LENGTH lddw2, T -// *dwf, LENGTH lddwf1, LENGTH lddwf2, T *dwc, LENGTH lddwc1, -// LENGTH lddwc2, T *dwr, LENGTH lddwr1, LENGTH lddwr2, T *dwcf, -// LENGTH lddwcf1, LENGTH lddwcf2, T *dwrf, LENGTH lddwrf1, LENGTH -// lddwrf2, T *dwrc, LENGTH lddwrc1, LENGTH lddwrc2, T *dwrcf, -// LENGTH lddwrcf1, LENGTH lddwrcf2, SIZE svr, SIZE svc, SIZE svf, -// SIZE nvr, SIZE nvc, SIZE nvf, int queue_idx, int config) { - -// #define GPK(R, C, F) \ -// { \ -// gpk_rev_adaptive_launcher( \ -// handle, shape_h, shape_d, shape_c_d, ldvs, ldws, unprocessed_n, \ -// unprocessed_dims, curr_dim_r, curr_dim_c, curr_dim_f, dratio_r, \ -// dratio_c, dratio_f, dv, lddv1, lddv2, dw, lddw1, lddw2, dwf, lddwf1, \ -// lddwf2, dwc, lddwc1, lddwc2, dwr, lddwr1, lddwr2, dwcf, lddwcf1, \ -// lddwcf2, dwrf, lddwrf1, lddwrf2, dwrc, lddwrc1, lddwrc2, dwrcf, \ -// lddwrcf1, lddwrcf2, svr, svc, svf, nvr, nvc, nvf, queue_idx); \ -// } -// bool profile = false; -// if (handle.profile_kernels) { -// profile = true; -// } -// if (D_LOCAL == 3) { -// // if (profile || config == 6) { -// // GPK(2, 2, 128) -// // } -// // if (profile || config == 5) { -// // GPK(2, 2, 64) -// // } -// // if (profile || config == 4) { -// // GPK(4, 4, 32) -// // } -// // if (profile || config == 3) { -// // GPK(4, 4, 16) -// // } -// // if (profile || config == 2) { -// // GPK(4, 4, 8) -// // } -// // if (profile || config == 1) { -// GPK(4, 4, 4) -// // } -// // if (profile || config == 0) { -// // GPK(4, 4, 4) -// // } -// } else if (D_LOCAL == 2) { -// if (profile || config == 6) { -// GPK(1, 2, 128) -// } -// if (profile || config == 5) { -// GPK(1, 2, 64) -// } -// if (profile || config == 4) { -// GPK(1, 4, 32) -// } -// if (profile || config == 3) { -// GPK(1, 4, 16) -// } -// if (profile || config == 2) { -// GPK(1, 4, 8) -// } -// if (profile || config == 1) { -// GPK(1, 4, 4) -// } -// if (profile || config == 0) { -// GPK(1, 2, 4) -// } -// } else if (D_LOCAL == 1) { -// if (profile || config == 6) { -// GPK(1, 1, 128) -// } 
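
The commented-out launcher above picks a fixed (R, C, F) thread block, sizes the grid by ceiling division over the coarse index space, folds every dimension not handled by this pass into gridx, and budgets shared memory for the block's tile plus one ratio vector per axis. A condensed sketch under those assumptions (integer ceiling division stands in for the float-based ceil; the struct and function names are illustrative, not MGARD API):

#include <algorithm>
#include <cstddef>

struct LaunchShape {
  unsigned tbx, tby, tbz;       // threads per block
  unsigned gridx, gridy, gridz; // blocks per grid
  std::size_t sm_bytes;         // dynamic shared memory
};

template <typename T, unsigned R, unsigned C, unsigned F>
LaunchShape gpk_launch_shape(unsigned nr, unsigned nc, unsigned nf) {
  LaunchShape l;
  l.tbz = R; l.tby = C; l.tbx = F;
  unsigned tz = std::max(nr - 1, 1u); // total threads per axis (n >= 1 assumed)
  unsigned ty = std::max(nc - 1, 1u);
  unsigned tx = std::max(nf - 1, 1u);
  l.gridz = (tz + R - 1) / R;         // ceil(t / tb)
  l.gridy = (ty + C - 1) / C;
  l.gridx = (tx + F - 1) / F;         // extra dims would multiply into gridx
  // Tile of (R+1)(C+1)(F+1) values plus R + C + F interpolation ratios.
  l.sm_bytes = ((R + 1) * (C + 1) * (F + 1) + R + C + F) * sizeof(T);
  return l;
}
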
-// if (profile || config == 5) {
-// GPK(1, 1, 64)
-// }
-// if (profile || config == 4) {
-// GPK(1, 1, 32)
-// }
-// if (profile || config == 3) {
-// GPK(1, 1, 16)
-// }
-// if (profile || config == 2) {
-// GPK(1, 1, 8)
-// }
-// if (profile || config == 1) {
-// GPK(1, 1, 8)
-// }
-// if (profile || config == 0) {
-// GPK(1, 1, 8)
-// }
-// }
-// #undef GPK
-// }
-
 } // namespace mgard_x
 
 #endif
\ No newline at end of file
diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel3D.h b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel3D.h
deleted file mode 100644
index 4769f7cdef..0000000000
--- a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel3D.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright 2022, Oak Ridge National Laboratory.
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs
- * Author: Jieyang Chen (chenj3@ornl.gov)
- * Date: March 17, 2022
- */
-
-#ifndef MGARD_X_GRID_PROCESSING_KERNEL_3D
-#define MGARD_X_GRID_PROCESSING_KERNEL_3D
-
-#include "../../Common.h"
-
-template
-void gpk_reo_3d(Handle &handle, SIZE nr, SIZE nc, SIZE nf, T *dratio_r,
-                T *dratio_c, T *dratio_f, T *dv, SIZE lddv1, SIZE lddv2, T *dw,
-                SIZE lddw1, SIZE lddw2, T *dwf, SIZE lddwf1, SIZE lddwf2,
-                T *dwc, SIZE lddwc1, SIZE lddwc2, T *dwr, SIZE lddwr1,
-                SIZE lddwr2, T *dwcf, SIZE lddwcf1, SIZE lddwcf2, T *dwrf,
-                SIZE lddwrf1, SIZE lddwrf2, T *dwrc, SIZE lddwrc1, SIZE lddwrc2,
-                T *dwrcf, SIZE lddwrcf1, SIZE lddwrcf2, int queue_idx,
-                int config);
-
-template
-void gpk_rev_3d(Handle &handle, SIZE nr, SIZE nc, SIZE nf, T *dratio_r,
-                T *dratio_c, T *dratio_f, T *dv, SIZE lddv1, SIZE lddv2, T *dw,
-                SIZE lddw1, SIZE lddw2, T *dwf, SIZE lddwf1, SIZE lddwf2,
-                T *dwc, SIZE lddwc1, SIZE lddwc2, T *dwr, SIZE lddwr1,
-                SIZE lddwr2, T *dwcf, SIZE lddwcf1, SIZE lddwcf2, T *dwrf,
-                SIZE lddwrf1, SIZE lddwrf2, T *dwrc, SIZE lddwrc1, SIZE lddwrc2,
-                T *dwrcf, SIZE lddwrcf1, SIZE lddwrcf2, SIZE svr, SIZE svc,
-                SIZE svf, SIZE nvr, SIZE nvc, SIZE nvf, int queue_idx,
-                int config);
-
-} // namespace mgard_x
-
-#endif
\ No newline at end of file
diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel3D.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel3D.hpp
index 1c40a28602..8e2a981bea 100644
--- a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel3D.hpp
+++ b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel3D.hpp
@@ -1227,15 +1227,12 @@ class GpkReo3D : public AutoTuner {
                  SubArray wrc, SubArray wrcf, int queue_idx) {
     int range_l = std::min(6, (int)std::log2(nf) - 1);
-    int arch = DeviceRuntime::GetArchitectureGeneration();
     int prec = TypeToIdx();
-    // int config =
-    // AutoTuner::autoTuningTable.auto_tuning_cc[arch][prec][range_l];
     int config = AutoTuner::autoTuningTable.gpk_reo_3d[prec][range_l];
-
     double min_time = std::numeric_limits<double>::max();
     int min_config = 0;
+    ExecutionReturn ret;
 
 #define GPK(CONFIG) \
   if (config == CONFIG || AutoTuner::ProfileKernels) { \
@@ -1248,22 +1245,26 @@ class GpkReo3D : public AutoTuner {
                                 ratio_c, ratio_f, v, w, wf, wc, wr, wcf, \
                                 wrf, wrc, wrcf, queue_idx); \
     DeviceAdapter adapter; \
-    ExecutionReturn ret = adapter.Execute(task); \
+    ret = adapter.Execute(task); \
     if (AutoTuner::ProfileKernels) { \
-      if (min_time > ret.execution_time) { \
+      if (ret.success && min_time > ret.execution_time) { \
        min_time = ret.execution_time; \
        min_config = CONFIG; \
      } \
    } \
  }
-  GPK(0)
-  GPK(1)
-  GPK(2)
-  GPK(3)
-  GPK(4)
-  GPK(5)
-  GPK(6)
+  GPK(6) if (!ret.success) config--;
+  GPK(5) if (!ret.success) config--;
+  GPK(4) if (!ret.success) config--;
+  GPK(3) if (!ret.success) config--;
+  GPK(2) if (!ret.success) config--;
+  GPK(1) if (!ret.success) config--;
+  GPK(0) if (!ret.success) config--;
+  if (config < 0 && !ret.success) {
+    std::cout << log::log_err << "no suitable config for GpkReo3D.\n";
+    exit(-1);
+  }
 #undef GPK
 
     if (AutoTuner::ProfileKernels) {
@@ -2444,15 +2445,12 @@ class GpkRev3D : public AutoTuner {
                  SIZE svr, SIZE svc, SIZE svf, SIZE nvr, SIZE nvc, SIZE nvf,
                  int queue_idx) {
     int range_l = std::min(6, (int)std::log2(nf) - 1);
-    int arch = DeviceRuntime::GetArchitectureGeneration();
     int prec = TypeToIdx();
-    // int config =
-    // AutoTuner::autoTuningTable.auto_tuning_cc[arch][prec][range_l];
     int config = AutoTuner::autoTuningTable.gpk_rev_3d[prec][range_l];
-
     double min_time = std::numeric_limits<double>::max();
     int min_config = 0;
+    ExecutionReturn ret;
 
 #define GPK(CONFIG) \
   if (config == CONFIG || AutoTuner::ProfileKernels) { \
@@ -2465,22 +2463,26 @@ class GpkRev3D : public AutoTuner {
     nr, nc, nf, nr_c, nc_c, nf_c, ratio_r, ratio_c, ratio_f, v, w, wf, wc, \
     wr, wcf, wrf, wrc, wrcf, svr, svc, svf, nvr, nvc, nvf, queue_idx); \
     DeviceAdapter adapter; \
-    ExecutionReturn ret = adapter.Execute(task); \
+    ret = adapter.Execute(task); \
     if (AutoTuner::ProfileKernels) { \
-      if (min_time > ret.execution_time) { \
+      if (ret.success && min_time > ret.execution_time) { \
        min_time = ret.execution_time; \
        min_config = CONFIG; \
      } \
    } \
  }
-  GPK(0)
-  GPK(1)
-  GPK(2)
-  GPK(3)
-  GPK(4)
-  GPK(5)
-  GPK(6)
+  GPK(6) if (!ret.success) config--;
+  GPK(5) if (!ret.success) config--;
+  GPK(4) if (!ret.success) config--;
+  GPK(3) if (!ret.success) config--;
+  GPK(2) if (!ret.success) config--;
+  GPK(1) if (!ret.success) config--;
+  GPK(0) if (!ret.success) config--;
+  if (config < 0 && !ret.success) {
+    std::cout << log::log_err << "no suitable config for GpkRev3D.\n";
+    exit(-1);
+  }
 #undef GPK
 
     if (AutoTuner::ProfileKernels) {
@@ -2489,2444 +2491,6 @@ class GpkRev3D : public AutoTuner {
   }
 };
 
-// template
-// MGARDX_EXEC void
-// __gpk_reo_3d(IDX ngridz, IDX ngridy, IDX ngridx,
-// IDX nblockz, IDX nblocky, IDX nblockx,
-// IDX blockz, IDX blocky, IDX blockx,
-// IDX threadz, IDX thready, IDX threadx,
-// SIZE nr, SIZE nc, SIZE nf,
-// SIZE nr_c, SIZE nc_c, SIZE nf_c,
-// T *dratio_r,
-// T *dratio_c, T *dratio_f,
-// T *dv, SIZE lddv1, SIZE lddv2,
-// T *dw, SIZE lddw1, SIZE lddw2,
-// T *dwf, SIZE lddwf1, SIZE lddwf2,
-// T *dwc, SIZE lddwc1, SIZE lddwc2,
-// T *dwr, SIZE lddwr1, SIZE lddwr2,
-// T *dwcf, SIZE lddwcf1, SIZE lddwcf2,
-// T *dwrf, SIZE lddwrf1, SIZE lddwrf2,
-// T *dwrc, SIZE lddwrc1, SIZE lddwrc2,
-// T *dwrcf, SIZE lddwrcf1, SIZE lddwrcf2) {
-
-// // // to be removed
-// int TYPE = 1;
-// bool INTERPOLATION = true;
-// bool CALC_COEFF = true;
-// bool in_next = false;
-// bool skip = false;
-
-// SIZE r, c, f;
-// SIZE rest_r, rest_c, rest_f;
-// SIZE nr_p, nc_p, nf_p;
-// SIZE rest_r_p, rest_c_p, rest_f_p;
-// SIZE r_sm, c_sm, f_sm;
-// SIZE r_sm_ex, c_sm_ex, f_sm_ex;
-// SIZE r_gl, c_gl, f_gl;
-// SIZE r_gl_ex, c_gl_ex, f_gl_ex;
-// LENGTH threadId;
-
-// T res;
-
-// // r = blockIdx.z * blockDim.z;
-// // c = blockIdx.y * blockDim.y;
-// // f = blockIdx.x * blockDim.x;
-
-// r = blockz * nblockz;
-// c = blocky * nblocky;
-// f = blockx * nblockx;
-
-// rest_r = nr - r;
-// rest_c = nc - c;
-// rest_f = nf
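
The two hunks above change the auto-tuning dispatch in the same way: instead of launching configurations 0 through 6 unconditionally, the dispatcher now starts from the largest configuration and, whenever Execute() reports failure (for example because the device cannot satisfy the block or shared-memory requirements), decrements config so the next smaller candidate runs; if all seven fail, it aborts. A condensed sketch of that control flow, where the hypothetical try_config stands in for the GPK(CONFIG) macro expansion:

#include <cstdio>
#include <cstdlib>

bool try_config(int cfg); // launches the kernel variant; false on failure

void dispatch(int config) {
  // Walk candidates from largest (6) to smallest (0); each failure shifts
  // the requested config down by one so the next candidate is attempted.
  for (int cfg = 6; cfg >= 0; --cfg) {
    if (config != cfg)
      continue;          // not the requested configuration
    if (try_config(cfg))
      return;            // success: done
    --config;            // failure: fall through to cfg - 1
  }
  std::fprintf(stderr, "no suitable config for GpkReo3D.\n");
  std::exit(-1);
}
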
[This hunk deletes a large block of commented-out code: the body of the dead __gpk_reo_3d device function. The deleted code padded even-sized dimensions (nr_p = nr + 1, mirroring the last plane, row, or fiber into the padded slot), staged a (R*2+1) x (C*2+1) x (F*2+1) tile of dv together with the ratio_r/ratio_c/ratio_f weight arrays in shared memory, loaded the extra boundary faces, edges, and corner vertex of the tile, and then wrote the even-indexed (coarse) nodes out to dw, walking a running `base` offset through the boundary faces, edges, and vertex so each leftover node class got its own thread range.]
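The index arithmetic in the deleted coarse-node stores is easier to see outside the kernel. Below is a minimal host-side sketch of the mapping the dead code used for its first R * C * F thread ids; the tile half-sizes R, C, F are illustrative values, not a configuration taken from the source.

```cpp
#include <cstdio>

// Host-side sketch: the first R*C*F linear thread ids cover the
// even-indexed (coarse) nodes of an (R*2+1) x (C*2+1) x (F*2+1) tile,
// exactly as in the deleted __gpk_reo_3d stores to dw.
int main() {
  const int R = 2, C = 2, F = 2; // illustrative tile half-sizes
  for (int threadId = 0; threadId < R * C * F; threadId++) {
    int r_sm = (threadId / (C * F)) * 2;       // even row within the tile
    int c_sm = ((threadId % (C * F)) / F) * 2; // even column within the tile
    int f_sm = ((threadId % (C * F)) % F) * 2; // even fiber within the tile
    std::printf("thread %d -> tile node (%d, %d, %d)\n", threadId, r_sm,
                c_sm, f_sm);
  }
  return 0;
}
```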
[Deleted continuation of the same dead function: the coefficient stores. For each odd-parity class of tile nodes (odd in f, c, or r only, and the mixed cf, rf, rc, and rcf classes), one thread computed res = v_sm[node] minus the linear, bilinear, or trilinear interpolation of the surrounding even-indexed nodes (built from lerp and the staged ratio arrays) and wrote res to the matching output array (dwf, dwc, dwr, dwcf, dwrf, dwrc, dwrcf). The trailing blocks repeated the same computation for nodes sitting on the last padded r, c, and f planes and on the domain edges, each guarded by a check that the block covers that boundary.]
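The trilinear ("rcf") case is the most involved of the deleted coefficient stores; the sketch below reproduces its structure on the host, under the assumption that lerp(a, b, w) = a + w * (b - a), which matches how the ratio weights are applied in the dead code. The neighbor values and ratios are made-up numbers for illustration.

```cpp
#include <cstdio>

// Assumed linear interpolation, applied along f, then c, then r,
// mirroring the f1..f4 / fc1,fc2 / res chain in the deleted code.
template <typename T> T lerp(T a, T b, T w) { return a + w * (b - a); }

int main() {
  // Eight even-indexed neighbors n[r][c][f] of an (odd, odd, odd) node.
  double n[2][2][2] = {{{1.0, 2.0}, {3.0, 4.0}}, {{5.0, 6.0}, {7.0, 8.0}}};
  double v = 4.2; // value at the odd node (illustrative)
  double ratio_r = 0.5, ratio_c = 0.5, ratio_f = 0.5;

  double f1 = lerp(n[0][0][0], n[0][0][1], ratio_f);
  double f2 = lerp(n[0][1][0], n[0][1][1], ratio_f);
  double f3 = lerp(n[1][0][0], n[1][0][1], ratio_f);
  double f4 = lerp(n[1][1][0], n[1][1][1], ratio_f);
  double fc1 = lerp(f1, f2, ratio_c);
  double fc2 = lerp(f3, f4, ratio_c);
  double interp = lerp(fc1, fc2, ratio_r);
  std::printf("coefficient = %f\n", v - interp); // res = v - interpolation
  return 0;
}
```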
[Deleted alongside the device function: the commented-out _gpk_reo_3d kernel wrapper, which forwarded gridDim/blockDim/blockIdx/threadIdx to __gpk_reo_3d; the gpk_reo_3d_adaptive_launcher, which set nr_c = nr / 2 + 1 (and likewise for nc_c, nf_c), fixed the block shape to (F, C, R), sized the grid as the ceiling division of max(n - 1, 1) per dimension, computed sm_size = ((R + 1) * (C + 1) * (F + 1) + R + C + F) * sizeof(T), launched the kernel, and ran gpuErrchk plus an optional cudaDeviceSynchronize when handle.sync_and_check_all_kernels was set; and the gpk_reo_3d dispatcher, which expanded a GPK(R, C, F) macro over seven (R, C, F) configurations per dimensionality D (3, 2, or 1), selected by a config index 0-6, or over all of them when handle.profile_kernels was set. The hunk then begins deleting the matching commented-out reverse kernel, _gpk_rev_3d, whose preamble mirrors the forward kernel's shared-memory layout, rest_*/rest_*_p bookkeeping, and even-dimension padding setup.]
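For reference, a host-side sketch of the deleted launcher's execution-config arithmetic follows. Note one assumption: the shared-memory size here is derived from the kernel's own tile layout, the (2R+1)(2C+1)(2F+1) values plus the three ratio arrays, which is not the same expression the deleted launcher used; treat it as a reading of the kernel, not of the launcher.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

using SIZE = unsigned int; // stand-in for the deleted code's SIZE
using T = double;          // stand-in for the deleted code's T

int main() {
  SIZE nr = 65, nc = 65, nf = 65;  // illustrative fine-grid extents
  const SIZE R = 4, C = 4, F = 16; // one of the dispatched configs
  // One block covers a (2R) x (2C) x (2F) brick of the fine grid.
  SIZE total_z = std::max(nr - 1, (SIZE)1);
  SIZE total_y = std::max(nc - 1, (SIZE)1);
  SIZE total_x = std::max(nf - 1, (SIZE)1);
  SIZE gridz = (SIZE)std::ceil((float)total_z / R);
  SIZE gridy = (SIZE)std::ceil((float)total_y / C);
  SIZE gridx = (SIZE)std::ceil((float)total_x / F);
  // Shared memory: the (2R+1)(2C+1)(2F+1) value tile plus ratio_f/c/r.
  size_t sm_size =
      ((R * 2 + 1) * (C * 2 + 1) * (F * 2 + 1) + R * 2 + C * 2 + F * 2) *
      sizeof(T);
  std::printf("grid (%u %u %u), block (%u %u %u), smem %zu bytes\n", gridx,
              gridy, gridz, F, C, R, sm_size);
  return 0;
}
```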
[Deleted _gpk_rev_3d continuation: the setup half of the reverse kernel. The dead code loaded the ratio arrays into shared memory (writing 0.5 into the padded slot when a dimension was even and the block reached the padded boundary), then gathered the coarse nodes from dw back into the even positions of the shared-memory tile, again covering the extra boundary faces, edges, and corner vertex through the same running `base` offsets as the forward kernel.]
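The two deleted kernels are inverses of each other along each parity class: _gpk_reo_3d stores coefficient = value - interpolation, and _gpk_rev_3d restores value = coefficient + interpolation. A minimal round-trip sketch, again assuming lerp(a, b, w) = a + w * (b - a) and using made-up values:

```cpp
#include <cassert>
#include <cmath>
#include <cstdio>

template <typename T> T lerp(T a, T b, T w) { return a + w * (b - a); }

int main() {
  double left = 1.0, right = 3.0, ratio = 0.4; // even-indexed neighbors
  double v = 2.5;                              // value at the odd node

  double coeff = v - lerp(left, right, ratio);          // forward (reo)
  double restored = coeff + lerp(left, right, ratio);   // inverse (rev)
  std::printf("coeff = %f, restored = %f\n", coeff, restored);
  assert(std::fabs(restored - v) < 1e-12); // round-trip up to rounding
  return 0;
}
```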
[Deleted _gpk_rev_3d continuation: the coefficient restores. For each odd-parity class, the dead code read the stored coefficient from the matching array (dwf, dwc, dwr, dwcf, dwrf, dwrc, or dwrcf), added back the corresponding linear, bilinear, or trilinear interpolation of the even-indexed neighbors, and wrote the sum into the shared-memory tile; the same restore logic was then repeated for nodes on the last padded r, c, and f planes and on the domain edges, mirroring the boundary handling of the forward kernel.]
lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm)], -// ratio_r_sm[r_sm - 1]); -// // if (c_gl == nc_c-1 && f_gl == nf_c - 1) -// // printf("block: (%d %d %d) thread: (%d %d %d) calc_coeff3 (%d -// // %d %d): %f <- %f %f\n", blockIdx.z, blockIdx.y, blockIdx.x, -// // threadIdx.z, threadIdx.y, threadIdx.x, r_sm, c_sm, f_sm, -// // res, v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, -// // f_sm)], -// // v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm)]); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } -// } - -// if (r + R * 2 == nr_p - 1 && f + F * 2 == nf_p - 1) { -// if (threadId >= R * C * F * 4 && threadId < R * C * F * 4 + C) { -// if (dwc) { -// r_sm = R * 2; -// c_sm = (threadId - R * C * F * 4) * 2 + 1; -// f_sm = F * 2; -// r_gl = r / 2 + R; -// c_gl = c / 2 + threadId - R * C * F * 4; -// f_gl = f / 2 + F; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc - nc_c && f_gl < nf_c) { -// res = dwc[get_idx(lddwc1, lddwc2, r_gl, c_gl, f_gl)]; -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm)], -// ratio_c_sm[c_sm - 1]); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } -// } - -// if (r + R * 2 == nr_p - 1 && c + C * 2 == nc_p - 1) { -// if (threadId >= R * C * F * 5 && threadId < R * C * F * 5 + F) { -// if (dwf) { -// r_sm = R * 2; -// c_sm = C * 2; -// f_sm = (threadId - R * C * F * 5) * 2 + 1; -// r_gl = r / 2 + R; -// c_gl = c / 2 + C; -// f_gl = f / 2 + threadId - R * C * F * 5; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf - nf_c) { -// res = dwf[get_idx(lddwf1, lddwf2, r_gl, c_gl, f_gl)]; -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm - 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm + 1)], -// ratio_f_sm[f_sm - 1]); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } -// } - -// // __syncthreads(); -// // if (debug) { -// // printf("TYPE: %d %d %d %d\n", TYPE, -// // min(rest_r_p, R * 2 + 1), -// // min(rest_c_p, C * 2 + 1), -// // min(rest_f_p, F * 2 + 1)); -// // for (int i = 0; i < min(rest_r_p, R * 2 + 1); i++) { -// // for (int j = 0; j < min(rest_c_p, C * 2 + 1); j++) { -// // for (int k = 0; k < min(rest_f_p, F * 2 + 1); k++) { -// // printf("%2.2f ", v_sm[get_idx(ldsm1, ldsm2, i, j, k)]); -// // } -// // printf("\n"); -// // } -// // printf("\n"); -// // } -// // } -// // __syncthreads(); - -// __syncthreads(); - -// r_sm = threadIdx.z; -// c_sm = threadIdx.y; -// f_sm = threadIdx.x; - -// r_sm_ex = blockDim.z; -// c_sm_ex = blockDim.y; -// f_sm_ex = blockDim.x; - -// r_gl = r + r_sm; -// c_gl = c + c_sm; -// f_gl = f + f_sm; - -// // r_gl_ex = r + R * 2; -// // c_gl_ex = c + C * 2; -// // f_gl_ex = f + F * 2; - -// r_gl_ex = r + rest_r - 1; -// c_gl_ex = c + rest_c - 1; -// f_gl_ex = f + rest_f - 1; - -// int unpadding_r = rest_r; -// int unpadding_c = rest_c; -// int unpadding_f = rest_f; -// if (nr % 2 == 0) -// unpadding_r -= 1; -// if (nc % 2 == 0) -// unpadding_c -= 1; -// if (nf % 2 == 0) -// unpadding_f -= 1; - -// if (r_sm < unpadding_r && c_sm < unpadding_c && f_sm < unpadding_f) { - -// // store extra rules -// // case 1: input = odd (non-padding required) -// // case 1.a: block size + 1 == rest (need to store extra); -// // case 1.b: block size + 1 != rest (No need to store extra); -// // case 2: input = even (un-padding requried) -// // 
case 2.a: block size + 1 >= rest (No need to store extra, but need -// // un-padding first); case 2.b: block size + 1 < rest (No need to -// store -// // extra); - -// if (D >= 3 && r_sm == 0) { -// if (nr % 2 != 0 && R * 2 + 1 == rest_r) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, f_sm)]; -// } -// if (nr % 2 == 0 && R * 2 + 1 >= rest_r_p) { -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, c_sm, f_sm)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm, f_sm)]; -// // if ( v_sm[get_idx(ldsm1, ldsm2, rest_r_p-1, c_sm, f_sm)] == -// 71177117) -// // printf("un-padding0 error block: (%d %d %d) thread: (%d %d %d) -// // un-padding (%d %d %d) %f (%d %d %d)\n", blockIdx.z, blockIdx.y, -// // blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, -// // rest_r-1, c_sm, f_sm, -// // v_sm[get_idx(ldsm1, ldsm2, rest_r_p-1, c_sm, f_sm)], -// rest_r_p-1, -// // c_sm, f_sm); -// } -// } - -// if (D >= 2 && c_sm == 0) { -// if (nc % 2 != 0 && C * 2 + 1 == rest_c) { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, f_sm)]; -// } -// if (nc % 2 == 0 && C * 2 + 1 >= rest_c_p) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c - 1, f_sm)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c_p - 1, f_sm)]; -// // if (v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c_p - 1, f_sm)] == -// 71177117) -// // printf("un-padding1 error block: (%d %d %d) thread: (%d %d %d) " -// // "un-padding (%d %d %d) %f (%d %d %d)\n", -// // blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, -// threadIdx.y, -// // threadIdx.x, r_sm, rest_c - 1, f_sm, -// // v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c_p - 1, f_sm)], -// r_sm, -// // rest_c_p - 1, f_sm); -// } -// } - -// if (D >= 1 && f_sm == 0) { -// if (nf % 2 != 0 && F * 2 + 1 == rest_f) { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm_ex)]; -// } -// if (nf % 2 == 0 && F * 2 + 1 >= rest_f_p) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, rest_f - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, rest_f_p - 1)]; -// // if ( v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, rest_f_p-1)] == -// 71177117) -// // printf("un-padding2 error block: (%d %d %d) thread: (%d %d %d) -// // un-padding (%d %d %d) %f (%d %d %d)\n", blockIdx.z, blockIdx.y, -// // blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, -// // r_sm, c_sm, rest_f-1, -// // v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, rest_f_p-1)], r_sm, -// c_sm, -// // rest_f_p-1); -// } -// } - -// // load extra edges -// if (D >= 2 && c_sm == 0 && f_sm == 0) { -// if (nc % 2 != 0 && C * 2 + 1 == rest_c && nf % 2 != 0 && -// F * 2 + 1 == rest_f) { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, f_sm_ex)]; -// } -// if (nc % 2 == 0 && nf % 2 == 0 && C * 2 + 1 >= rest_c_p && -// F * 2 + 1 >= rest_f_p) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c - 1, rest_f - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c_p - 1, rest_f_p - 1)]; -// // printf("block: (%d %d %d) thread: (%d %d %d) un-padding (%d %d %d) -// %f -// // (%d %d %d)\n", blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, -// // threadIdx.y, threadIdx.x, r_sm, rest_c-1, rest_f-1, -// // v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c-1, rest_f-1)], r_sm, -// // rest_c_p-1, rest_f_p-1); -// } -// if (nc % 2 == 0 && nf % 2 != 0 && C * 2 + 1 >= rest_c_p && -// F * 2 + 1 == rest_f) { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c_p - 1, f_sm_ex)]; -// } -// if (nc % 2 != 0 && nf 
% 2 == 0 && C * 2 + 1 == rest_c && -// F * 2 + 1 >= rest_f_p) { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, rest_f_p - 1)]; -// // printf("(%d %d %d): %f <- (%d %d %d)\n", -// // r_gl, c_gl_ex, f_gl_ex, -// // dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)], -// // r_sm, c_sm_ex, f_gl_ex); -// } -// } - -// if (D >= 3 && r_sm == 0 && f_sm == 0) { -// if (nr % 2 != 0 && R * 2 + 1 == rest_r && nf % 2 != 0 && -// F * 2 + 1 == rest_f) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, f_sm_ex)]; -// } -// if (nr % 2 == 0 && nf % 2 == 0 && R * 2 + 1 >= rest_r_p && -// F * 2 + 1 >= rest_f_p) { -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, c_sm, rest_f - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm, rest_f_p - 1)]; -// // if ( v_sm[get_idx(ldsm1, ldsm2, rest_r_p-1, c_sm, rest_f_p-1)] == -// // 71177117) printf("un-padding3 error block: (%d %d %d) thread: (%d -// %d -// // %d) un-padding (%d %d %d) %f (%d %d %d)\n", blockIdx.z, -// blockIdx.y, -// // blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, -// // rest_r-1, c_sm, rest_f-1, -// // v_sm[get_idx(ldsm1, ldsm2, rest_r_p-1, c_sm, rest_f_p-1)], -// // rest_r_p-1, c_sm, rest_f_p-1); -// } -// if (nr % 2 == 0 && nf % 2 != 0 && R * 2 + 1 >= rest_r_p && -// F * 2 + 1 == rest_f) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm, f_sm_ex)]; -// } -// if (nr % 2 != 0 && nf % 2 == 0 && R * 2 + 1 == rest_r && -// F * 2 + 1 >= rest_f_p) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, rest_f_p - 1)]; -// // printf("(%d %d %d): %f <- (%d %d %d)\n", -// // r_gl_ex, c_gl, rest_f-1, -// // dv[get_idx(lddv1, lddv2, r_gl_ex-1, c_gl, f_gl_ex)], -// // r_sm_ex, c_sm, rest_f_p-1); -// } -// } - -// if (D >= 3 && r_sm == 0 && c_sm == 0) { -// if (nr % 2 != 0 && R * 2 + 1 == rest_r && nc % 2 != 0 && -// C * 2 + 1 == rest_c) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, f_sm)]; -// } -// if (nr % 2 == 0 && nc % 2 == 0 && R * 2 + 1 >= rest_r_p && -// C * 2 + 1 >= rest_c_p) { -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, rest_c - 1, f_sm)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, rest_c_p - 1, f_sm)]; -// // if ( v_sm[get_idx(ldsm1, ldsm2, rest_r_p-1, rest_c_p-1, f_sm)] == -// // 71177117) printf("un-padding4 error block: (%d %d %d) thread: (%d -// %d -// // %d) un-padding (%d %d %d) %f (%d %d %d)\n", blockIdx.z, -// blockIdx.y, -// // blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, -// // rest_r-1, rest_c-1, f_sm, -// // v_sm[get_idx(ldsm1, ldsm2, rest_r_p-1, rest_c_p-1, f_sm)], -// // rest_r_p-1, rest_c_p-1, f_sm); -// } -// if (nr % 2 == 0 && nc % 2 != 0 && R * 2 + 1 >= rest_r_p && -// C * 2 + 1 == rest_c) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm_ex, f_sm)]; -// } -// if (nr % 2 != 0 && nc % 2 == 0 && R * 2 + 1 == rest_r && -// C * 2 + 1 >= rest_c_p) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c_p - 1, f_sm)]; -// } -// } -// // load extra vertex - -// if (D >= 3 && r_sm == 0 && c_sm == 0 && f_sm == 0) { -// if (nr % 2 != 0 && R * 2 + 1 == rest_r && nc % 2 != 0 && -// C * 2 + 1 == rest_c && nf % 2 != 0 && F * 2 + 1 == rest_f) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, f_sm_ex)]; 
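// Taken together, the boundary branches here implement the store rules
// sketched earlier in this kernel: for a dimension of odd length no
// padding was added, and a block whose full footprint exactly covers the
// remainder (block size * 2 + 1 == rest) additionally writes the last
// "extra" node; for an even length one padded layer was added on load,
// so the value at the padded index rest_p - 1 is copied back to index
// rest - 1 before the final store (un-padding). The same rule is applied
// independently per axis, which is why the edge and vertex cases
// enumerate every odd/even combination of nr, nc, and nf.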
-// } - -// if (nr % 2 == 0 && nc % 2 == 0 && nf % 2 == 0 && R * 2 + 1 >= rest_r_p -// && -// C * 2 + 1 >= rest_c_p && F * 2 + 1 >= rest_f_p) { -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, rest_c - 1, rest_f - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, rest_c_p - 1, -// rest_f_p - 1)]; - -// // printf("block: (%d %d %d) thread: (%d %d %d) un-padding (%d %d %d) -// %f -// // (%d %d %d)\n", blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, -// // threadIdx.y, threadIdx.x, rest_r-1, rest_c-1, rest_f-1, -// // v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c-1, rest_f-1)], -// // rest_r_p-1, rest_c_p-1, rest_f_p-1); -// } -// if (nr % 2 == 0 && nc % 2 == 0 && nf % 2 != 0 && R * 2 + 1 >= rest_r_p -// && -// C * 2 + 1 >= rest_c_p && F * 2 + 1 == rest_f) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, rest_c_p - 1, f_sm_ex)]; -// } -// if (nr % 2 == 0 && nc % 2 != 0 && nf % 2 == 0 && R * 2 + 1 >= rest_r_p -// && -// C * 2 + 1 == rest_c && F * 2 + 1 >= rest_f_p) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm_ex, rest_f_p - 1)]; -// } -// if (nr % 2 != 0 && nc % 2 == 0 && nf % 2 == 0 && R * 2 + 1 == rest_r && -// C * 2 + 1 >= rest_c_p && F * 2 + 1 >= rest_f_p) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c_p - 1, rest_f_p - 1)]; -// } -// if (nr % 2 == 0 && nc % 2 != 0 && nf % 2 != 0 && R * 2 + 1 >= rest_r_p -// && -// C * 2 + 1 == rest_c && F * 2 + 1 == rest_f) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm_ex, f_sm_ex)]; -// } -// if (nr % 2 != 0 && nc % 2 == 0 && nf % 2 != 0 && R * 2 + 1 == rest_r && -// C * 2 + 1 >= rest_c_p && F * 2 + 1 == rest_f) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c_p - 1, f_sm_ex)]; -// } -// if (nr % 2 != 0 && nc % 2 != 0 && nf % 2 == 0 && R * 2 + 1 == rest_r && -// C * 2 + 1 == rest_c && F * 2 + 1 >= rest_f_p) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, rest_f_p - 1)]; -// } -// } -// } - -// __syncthreads(); - -// if (r_sm < rest_r && c_sm < rest_c && f_sm < rest_f) { -// if (r_gl >= svr && r_gl < svr + nvr && c_gl >= svc && c_gl < svc + nvc && -// f_gl >= svf && f_gl < svf + nvf) { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl, f_gl)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)]; - -// // if (c_gl == nc - 1 && f_gl == nf - 1) { -// // printf("block: (%d %d %d) thread: (%d %d %d) store (%d %d %d) %f -// // (%d %d %d)\n", blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, -// // threadIdx.y, threadIdx.x, r_gl, c_gl, f_gl, -// // v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)], r_sm, c_sm, -// f_sm); -// // } -// } -// } -// } - -// template -// void gpk_rev_3d_adaptive_launcher( -// Handle &handle, SIZE nr, SIZE nc, SIZE nf, T *dratio_r, T -// *dratio_c, T *dratio_f, T *dv, SIZE lddv1, SIZE lddv2, T *dw, SIZE lddw1, -// SIZE lddw2, T *dwf, SIZE lddwf1, SIZE lddwf2, T *dwc, SIZE lddwc1, SIZE -// lddwc2, T *dwr, SIZE lddwr1, SIZE lddwr2, T *dwcf, SIZE lddwcf1, SIZE -// lddwcf2, T *dwrf, SIZE lddwrf1, SIZE lddwrf2, T *dwrc, SIZE lddwrc1, SIZE -// lddwrc2, T *dwrcf, SIZE lddwrcf1, SIZE lddwrcf2, SIZE svr, SIZE svc, SIZE -// svf, SIZE nvr, SIZE nvc, SIZE nvf, int queue_idx) { -// cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); -// cudaDeviceSetCacheConfig(cudaFuncCachePreferShared); -// SIZE 
nr_c = nr / 2 + 1; -// SIZE nc_c = nc / 2 + 1; -// SIZE nf_c = nf / 2 + 1; -// SIZE total_thread_z = std::max(nr - 1, (SIZE)1); -// SIZE total_thread_y = std::max(nc - 1, (SIZE)1); -// SIZE total_thread_x = std::max(nf - 1, (SIZE)1); - -// SIZE tbx, tby, tbz, gridx, gridy, gridz; -// dim3 threadsPerBlock, blockPerGrid; -// size_t sm_size; - -// // tbz = std::min(R, total_thread_z); -// // tby = std::min(C, total_thread_y); -// // tbx = std::min(F, total_thread_x); -// tbz = R; -// tby = C; -// tbx = F; -// sm_size = ((R + 1) * (C + 1) * (F + 1) + R + C + F) * sizeof(T); -// gridz = ceil((float)total_thread_z / tbz); -// gridy = ceil((float)total_thread_y / tby); -// gridx = ceil((float)total_thread_x / tbx); -// threadsPerBlock = dim3(tbx, tby, tbz); -// blockPerGrid = dim3(gridx, gridy, gridz); -// // printf("prolongate exec: %d %d %d %d %d %d\n", tbx, tby, tbz, gridx, -// gridy, -// // gridz); -// _gpk_rev_3d -// <<>>( -// nr, nc, nf, nr_c, nc_c, nf_c, dratio_r, dratio_c, dratio_f, dv, -// lddv1, lddv2, dw, lddw1, lddw2, dwf, lddwf1, lddwf2, dwc, lddwc1, -// lddwc2, dwr, lddwr1, lddwr2, dwcf, lddwcf1, lddwcf2, dwrf, lddwrf1, -// lddwrf2, dwrc, lddwrc1, lddwrc2, dwrcf, lddwrcf1, lddwrcf2, svr, -// svc, svf, nvr, nvc, nvf); -// gpuErrchk(cudaGetLastError()); -// if (handle.sync_and_check_all_kernels) { -// gpuErrchk(cudaDeviceSynchronize()); -// } -// } - -// template -// void gpk_rev_3d(Handle &handle, SIZE nr, SIZE nc, SIZE nf, T *dratio_r, -// T *dratio_c, T *dratio_f, T *dv, SIZE lddv1, SIZE lddv2, T -// *dw, SIZE lddw1, SIZE lddw2, T *dwf, SIZE lddwf1, SIZE -// lddwf2, T *dwc, SIZE lddwc1, SIZE lddwc2, T *dwr, SIZE -// lddwr1, SIZE lddwr2, T *dwcf, SIZE lddwcf1, SIZE lddwcf2, T -// *dwrf, SIZE lddwrf1, SIZE lddwrf2, T *dwrc, SIZE lddwrc1, -// SIZE lddwrc2, T *dwrcf, SIZE lddwrcf1, SIZE lddwrcf2, SIZE -// svr, SIZE svc, SIZE svf, SIZE nvr, SIZE nvc, SIZE nvf, int -// queue_idx, int config) { - -// #define GPK(R, C, F) \ -// { \ -// gpk_rev_3d_adaptive_launcher( \ -// handle, nr, nc, nf, dratio_r, dratio_c, dratio_f, dv, lddv1, lddv2, \ -// dw, lddw1, lddw2, dwf, lddwf1, lddwf2,\ -// dwc, \ -// lddwc1, lddwc2, dwr, lddwr1, lddwr2, dwcf, lddwcf1, lddwcf2,\ -// dwrf, \ -// lddwrf1, lddwrf2, dwrc, lddwrc1, lddwrc2, dwrcf, lddwrcf1, lddwrcf2, \ -// svr, svc, svf, nvr, nvc, nvf, queue_idx); \ -// } -// bool profile = false; -// if (handle.profile_kernels) { -// profile = true; -// } -// if (D == 3) { -// if (profile || config == 6) { -// GPK(2, 2, 128) -// } -// if (profile || config == 5) { -// GPK(2, 2, 64) -// } -// if (profile || config == 4) { -// GPK(4, 4, 32) -// } -// if (profile || config == 3) { -// GPK(4, 4, 16) -// } -// if (profile || config == 2) { -// GPK(4, 4, 8) -// } -// if (profile || config == 1) { -// GPK(4, 4, 4) -// } -// if (profile || config == 0) { -// GPK(2, 2, 2) -// } -// } else if (D == 2) { -// if (profile || config == 6) { -// GPK(1, 2, 128) -// } -// if (profile || config == 5) { -// GPK(1, 2, 64) -// } -// if (profile || config == 4) { -// GPK(1, 4, 32) -// } -// if (profile || config == 3) { -// GPK(1, 4, 16) -// } -// if (profile || config == 2) { -// GPK(1, 4, 8) -// } -// if (profile || config == 1) { -// GPK(1, 4, 4) -// } -// if (profile || config == 0) { -// GPK(1, 2, 4) -// } -// } else if (D == 1) { -// if (profile || config == 6) { -// GPK(1, 1, 128) -// } -// if (profile || config == 5) { -// GPK(1, 1, 64) -// } -// if (profile || config == 4) { -// GPK(1, 1, 32) -// } -// if (profile || config == 3) { -// GPK(1, 1, 16) -// } -// if (profile || 
config == 2) { -// GPK(1, 1, 8) -// } -// if (profile || config == 1) { -// GPK(1, 1, 8) -// } -// if (profile || config == 0) { -// GPK(1, 1, 8) -// } -// } -// #undef GPK -// } - } // namespace mgard_x #endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/CopyND/AddND.hpp b/include/mgard-x/DataRefactoring/MultiDimension/CopyND/AddND.hpp new file mode 100644 index 0000000000..2cabdfb5c0 --- /dev/null +++ b/include/mgard-x/DataRefactoring/MultiDimension/CopyND/AddND.hpp @@ -0,0 +1,29 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "../../../Hierarchy/Hierarchy.hpp" +#include "../../../RuntimeX/RuntimeX.h" + +#include "../DataRefactoring.h" + +#include "../Correction/LevelwiseProcessingKernel.hpp" + +#ifndef MGARD_X_DATA_REFACTORING_ADD_ND +#define MGARD_X_DATA_REFACTORING_ADD_ND + +namespace mgard_x { + +template +void AddND(SubArray dinput, + SubArray &doutput, int queue_idx) { + + LwpkReo().Execute(dinput, doutput, queue_idx); +} + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/CopyND/CopyND.hpp b/include/mgard-x/DataRefactoring/MultiDimension/CopyND/CopyND.hpp new file mode 100644 index 0000000000..9783ded972 --- /dev/null +++ b/include/mgard-x/DataRefactoring/MultiDimension/CopyND/CopyND.hpp @@ -0,0 +1,29 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "../../../Hierarchy/Hierarchy.hpp" +#include "../../../RuntimeX/RuntimeX.h" + +#include "../DataRefactoring.h" + +#include "../Correction/LevelwiseProcessingKernel.hpp" + +#ifndef MGARD_X_DATA_REFACTORING_COPY_ND +#define MGARD_X_DATA_REFACTORING_COPY_ND + +namespace mgard_x { + +template +void CopyND(SubArray dinput, + SubArray &doutput, int queue_idx) { + + LwpkReo().Execute(dinput, doutput, queue_idx); +} + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/CopyND/SubtractND.hpp b/include/mgard-x/DataRefactoring/MultiDimension/CopyND/SubtractND.hpp new file mode 100644 index 0000000000..8187f8b32f --- /dev/null +++ b/include/mgard-x/DataRefactoring/MultiDimension/CopyND/SubtractND.hpp @@ -0,0 +1,29 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. 
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "../../../Hierarchy/Hierarchy.hpp" +#include "../../../RuntimeX/RuntimeX.h" + +#include "../DataRefactoring.h" + +#include "../Correction/LevelwiseProcessingKernel.hpp" + +#ifndef MGARD_X_DATA_REFACTORING_SUBTRACT_ND +#define MGARD_X_DATA_REFACTORING_SUBTRACT_ND + +namespace mgard_x { + +template +void SubtractND(SubArray dinput, + SubArray &doutput, int queue_idx) { + + LwpkReo().Execute(dinput, doutput, queue_idx); +} + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrection3D.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrection3D.hpp new file mode 100644 index 0000000000..97cf131552 --- /dev/null +++ b/include/mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrection3D.hpp @@ -0,0 +1,184 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "../../../Hierarchy/Hierarchy.hpp" +#include "../../../RuntimeX/RuntimeX.h" + +#include "../DataRefactoring.h" + +#include "IterativeProcessingKernel3D.hpp" +#include "LinearProcessingKernel3D.hpp" + +#ifndef MGARD_X_DATA_REFACTORING_CALC_CORRECTION_3D +#define MGARD_X_DATA_REFACTORING_CALC_CORRECTION_3D + +namespace mgard_x { + +template +void CalcCorrection3D(Hierarchy &hierarchy, + SubArray dcoeff, + SubArray &dcorrection, SIZE l, + int queue_idx) { + + std::string prefix = "decomp_"; + if (sizeof(T) == sizeof(double)) + prefix += "d_"; + if (sizeof(T) == sizeof(float)) + prefix += "f_"; + for (int d = 0; d < D; d++) + prefix += std::to_string(hierarchy.shape[d]) + "_"; + + SubArray dw_in1, dw_in2, dw_out; + + if (D >= 1) { + dw_in1 = dcoeff; + dw_in1.resize( + {hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l], hierarchy.dofs[2][l]}); + dw_in2 = dcoeff; + dw_in2.offset({hierarchy.dofs[0][l + 1], 0, 0}); + dw_in2.resize({hierarchy.dofs[0][l] - hierarchy.dofs[0][l + 1], + hierarchy.dofs[1][l], hierarchy.dofs[2][l]}); + dw_out = dcorrection; + dw_out.resize( + {hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l], hierarchy.dofs[2][l]}); + + Lpk1Reo3D().Execute( + hierarchy.dofs[2][l], hierarchy.dofs[1][l], hierarchy.dofs[0][l], + hierarchy.dofs[0][l + 1], hierarchy.dofs[2][l + 1], + hierarchy.dofs[1][l + 1], hierarchy.dofs[0][l + 1], + SubArray(hierarchy.dist_array[0][l]), + SubArray(hierarchy.ratio_array[0][l]), dw_in1, dw_in2, dw_out, + queue_idx); + + verify_matrix_cuda(hierarchy.dofs[2][l], hierarchy.dofs[1][l], + hierarchy.dofs[0][l + 1], dw_out.data(), dw_out.getLd(0), + dw_out.getLd(1), dw_out.getLd(0), + prefix + "lpk_reo_1_3d" + "_level_" + std::to_string(l), + multidim_refactoring_store, multidim_refactoring_verify); + + if (multidim_refactoring_debug_print) { + PrintSubarray("after mass_trans_multiply_1_cpt", dw_out); + } + } + + if (D >= 2) { + dw_in1 = dw_out; + dw_in1.resize({hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[2][l]}); + dw_in2 = dw_out; + dw_in2.offset({0, hierarchy.dofs[1][l + 1], 0}); + dw_in2.resize({hierarchy.dofs[0][l + 1], + hierarchy.dofs[1][l] - hierarchy.dofs[1][l + 1], + hierarchy.dofs[2][l]}); + dw_out.offset({hierarchy.dofs[0][l + 1], 0, 0}); + dw_out.resize({hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[2][l]}); + + 
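// CalcCorrection3D computes the coarse-grid correction in two phases.
// First, mass-matrix application (Lpk1/Lpk2/Lpk3Reo3D) restricts along
// f, then c, then r: each stage reads the part of the workspace aligned
// with the coarse nodes (dw_in1) together with the remaining coefficient
// part (dw_in2), and writes a result whose current dimension has shrunk
// to the coarse size (dw_out), offset into an unused region of the
// workspace so inputs are never overwritten. Second, the tridiagonal
// solves (Ipk1/Ipk2/Ipk3Reo3D below) invert the 1-D mass matrices along
// each dimension on the coarse grid, yielding the correction returned
// through dcorrection.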
Lpk2Reo3D().Execute( + hierarchy.dofs[2][l], hierarchy.dofs[1][l], hierarchy.dofs[0][l + 1], + hierarchy.dofs[1][l + 1], SubArray(hierarchy.dist_array[1][l]), + SubArray(hierarchy.ratio_array[1][l]), dw_in1, dw_in2, dw_out, + queue_idx); + + verify_matrix_cuda(hierarchy.dofs[2][l], hierarchy.dofs[1][l + 1], + hierarchy.dofs[0][l + 1], dw_out.data(), dw_out.getLd(0), + dw_out.getLd(1), dw_out.getLd(0), + prefix + "lpk_reo_2_3d" + "_level_" + std::to_string(l), + multidim_refactoring_store, multidim_refactoring_verify); + + if (multidim_refactoring_debug_print) { + PrintSubarray("after mass_trans_multiply_2_cpt", dw_out); + } + } + + if (D == 3) { + dw_in1 = dw_out; + dw_in1.resize({hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[2][l + 1]}); + dw_in2 = dw_out; + dw_in2.offset({0, 0, hierarchy.dofs[2][l + 1]}); + dw_in2.resize({hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[2][l] - hierarchy.dofs[2][l + 1]}); + dw_out.offset({hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l + 1], 0}); + dw_out.resize({hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[2][l + 1]}); + + Lpk3Reo3D().Execute( + hierarchy.dofs[2][l], hierarchy.dofs[1][l + 1], + hierarchy.dofs[0][l + 1], hierarchy.dofs[2][l + 1], + SubArray(hierarchy.dist_array[2][l]), + SubArray(hierarchy.ratio_array[2][l]), dw_in1, dw_in2, dw_out, + queue_idx); + + verify_matrix_cuda(hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[0][l + 1], dw_out.data(), dw_out.getLd(0), + dw_out.getLd(1), dw_out.getLd(0), + prefix + "lpk_reo_3_3d" + "_level_" + std::to_string(l), + multidim_refactoring_store, multidim_refactoring_verify); + + if (multidim_refactoring_debug_print) { + PrintSubarray("after mass_trans_multiply_3_cpt", dw_out); + } + } + + if (D >= 1) { + Ipk1Reo3D().Execute( + hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[0][l + 1], SubArray(hierarchy.am_array[0][l + 1]), + SubArray(hierarchy.bm_array[0][l + 1]), + SubArray(hierarchy.dist_array[0][l + 1]), dw_out, queue_idx); + verify_matrix_cuda(hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[0][l + 1], dw_out.data(), dw_out.getLd(0), + dw_out.getLd(1), dw_out.getLd(0), + prefix + "ipk_1_3d" + "_level_" + std::to_string(l), + multidim_refactoring_store, multidim_refactoring_verify); + + if (multidim_refactoring_debug_print) { + PrintSubarray("after solve_tridiag_1_cpt", dw_out); + } + } + if (D >= 2) { + Ipk2Reo3D().Execute( + hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[0][l + 1], SubArray(hierarchy.am_array[1][l + 1]), + SubArray(hierarchy.bm_array[1][l + 1]), + SubArray(hierarchy.dist_array[1][l + 1]), dw_out, queue_idx); + + verify_matrix_cuda(hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[0][l + 1], dw_out.data(), dw_out.getLd(0), + dw_out.getLd(1), dw_out.getLd(0), + prefix + "ipk_2_3d" + "_level_" + std::to_string(l), + multidim_refactoring_store, multidim_refactoring_verify); + + if (multidim_refactoring_debug_print) { + PrintSubarray("after solve_tridiag_2_cpt", dw_out); + } + } + if (D == 3) { + Ipk3Reo3D().Execute( + hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[0][l + 1], SubArray(hierarchy.am_array[2][l + 1]), + SubArray(hierarchy.bm_array[2][l + 1]), + SubArray(hierarchy.dist_array[2][l + 1]), dw_out, queue_idx); + + verify_matrix_cuda(hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[0][l + 1], dw_out.data(), dw_out.getLd(0), + dw_out.getLd(1), dw_out.getLd(0), + prefix + 
"ipk_3_3d" + "_level_" + std::to_string(l), + multidim_refactoring_store, multidim_refactoring_verify); + + if (multidim_refactoring_debug_print) { + PrintSubarray("after solve_tridiag_3_cpt", dw_out); + } + } + // final correction output + dcorrection = dw_out; +} + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrectionND.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrectionND.hpp new file mode 100644 index 0000000000..0fd807ea9e --- /dev/null +++ b/include/mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrectionND.hpp @@ -0,0 +1,228 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "../../../Hierarchy/Hierarchy.hpp" +#include "../../../RuntimeX/RuntimeX.h" + +#include "../DataRefactoring.h" + +#include "IterativeProcessingKernel.hpp" +#include "LinearProcessingKernel.hpp" + +#ifndef MGARD_X_DATA_REFACTORING_CALC_CORRECTION_ND +#define MGARD_X_DATA_REFACTORING_CALC_CORRECTION_ND + +namespace mgard_x { + +template +void CalcCorrectionND(Hierarchy &hierarchy, + SubArray dcoeff, + SubArray &dcorrection, SIZE l, + int queue_idx) { + + std::string prefix = "decomp_"; + if (sizeof(T) == sizeof(double)) + prefix += "d_"; + if (sizeof(T) == sizeof(float)) + prefix += "f_"; + for (int d = 0; d < D; d++) + prefix += std::to_string(hierarchy.shape[d]) + "_"; + + SubArray dw_in1 = dcoeff; + SubArray dw_in2 = dcoeff; + SubArray dw_out = dcorrection; + + // start correction calculation + int prev_dim_r, prev_dim_c, prev_dim_f; + int curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; + + dw_in1.resize(curr_dim_f, hierarchy.dofs[curr_dim_f][l + 1]); + dw_in2.offset(curr_dim_f, hierarchy.dofs[curr_dim_f][l + 1]); + dw_in2.resize(curr_dim_f, hierarchy.dofs[curr_dim_f][l] - + hierarchy.dofs[curr_dim_f][l + 1]); + dw_out.resize(curr_dim_f, hierarchy.dofs[curr_dim_f][l + 1]); + + dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); + + Lpk1Reo().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.processed_n[0], + SubArray<1, SIZE, DeviceType>(hierarchy.processed_dims[0], true), + curr_dim_r, curr_dim_c, curr_dim_f, + SubArray(hierarchy.dist_array[curr_dim_f][l]), + SubArray(hierarchy.ratio_array[curr_dim_f][l]), dw_in1, dw_in2, dw_out, + queue_idx); + + if (multidim_refactoring_debug_print) { // debug + PrintSubarray4D(format("decomposition: after MR-1D[{}]", l), dw_out); + } + + // mass trans 2D + prev_dim_f = curr_dim_f; + prev_dim_c = curr_dim_c; + prev_dim_r = curr_dim_r; + curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; + + dw_in1 = dw_out; + dw_in2 = dw_out; + dw_in1.resize(curr_dim_c, hierarchy.dofs[curr_dim_c][l + 1]); + dw_in2.offset(curr_dim_c, hierarchy.dofs[curr_dim_c][l + 1]); + dw_in2.resize(curr_dim_c, hierarchy.dofs[curr_dim_c][l] - + hierarchy.dofs[curr_dim_c][l + 1]); + dw_out.offset(prev_dim_f, hierarchy.dofs[curr_dim_f][l + 1]); + dw_out.resize(curr_dim_c, hierarchy.dofs[curr_dim_c][l + 1]); + + dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); + + Lpk2Reo().Execute( + SubArray<1, SIZE, 
DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.processed_n[1], + SubArray<1, SIZE, DeviceType>(hierarchy.processed_dims[1], true), + curr_dim_r, curr_dim_c, curr_dim_f, + SubArray(hierarchy.dist_array[curr_dim_c][l]), + SubArray(hierarchy.ratio_array[curr_dim_c][l]), dw_in1, dw_in2, dw_out, + queue_idx); + + if (multidim_refactoring_debug_print) { // debug + PrintSubarray4D(format("decomposition: after MR-2D[{}]", l), dw_out); + } + + // mass trans 3D + + prev_dim_f = curr_dim_f; + prev_dim_c = curr_dim_c; + prev_dim_r = curr_dim_r; + curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; + + dw_in1 = dw_out; + dw_in2 = dw_out; + dw_in1.resize(curr_dim_r, hierarchy.dofs[curr_dim_r][l + 1]); + dw_in2.offset(curr_dim_r, hierarchy.dofs[curr_dim_r][l + 1]); + dw_in2.resize(curr_dim_r, hierarchy.dofs[curr_dim_r][l] - + hierarchy.dofs[curr_dim_r][l + 1]); + dw_out.offset(prev_dim_c, hierarchy.dofs[curr_dim_c][l + 1]); + dw_out.resize(curr_dim_r, hierarchy.dofs[curr_dim_r][l + 1]); + + dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); + + Lpk3Reo().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.processed_n[2], + SubArray<1, SIZE, DeviceType>(hierarchy.processed_dims[2], true), + curr_dim_r, curr_dim_c, curr_dim_f, + SubArray(hierarchy.dist_array[curr_dim_r][l]), + SubArray(hierarchy.ratio_array[curr_dim_r][l]), dw_in1, dw_in2, dw_out, + queue_idx); + + if (multidim_refactoring_debug_print) { // debug + PrintSubarray4D(format("decomposition: after MR-3D[{}]", l), dw_out); + } + + // mass trans 4D+ + for (int i = 3; i < D; i++) { + prev_dim_f = curr_dim_f; + prev_dim_c = curr_dim_c; + prev_dim_r = curr_dim_r; + curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = i; + dw_in1 = dw_out; + dw_in2 = dw_out; + dw_in1.resize(curr_dim_r, hierarchy.dofs[curr_dim_r][l + 1]); + dw_in2.offset(curr_dim_r, hierarchy.dofs[curr_dim_r][l + 1]); + dw_in2.resize(curr_dim_r, hierarchy.dofs[curr_dim_r][l] - + hierarchy.dofs[curr_dim_r][l + 1]); + dw_out.offset(prev_dim_r, hierarchy.dofs[prev_dim_r][l + 1]); + dw_out.resize(curr_dim_r, hierarchy.dofs[curr_dim_r][l + 1]); + + dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); + Lpk3Reo().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.processed_n[i], + SubArray<1, SIZE, DeviceType>(hierarchy.processed_dims[i], true), + curr_dim_r, curr_dim_c, curr_dim_f, + SubArray(hierarchy.dist_array[curr_dim_r][l]), + SubArray(hierarchy.ratio_array[curr_dim_r][l]), dw_in1, dw_in2, dw_out, + queue_idx); + + if (multidim_refactoring_debug_print) { // debug + PrintSubarray4D(format("decomposition: after MR-{}D[{}]", i + 1, l), + dw_out); + } + } + + curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; + dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); + Ipk1Reo().Execute( + curr_dim_r, curr_dim_c, curr_dim_f, + SubArray(hierarchy.am_array[curr_dim_f][l + 1]), + SubArray(hierarchy.bm_array[curr_dim_f][l + 1]), dw_out, queue_idx); + + if (multidim_refactoring_debug_print) { // debug + PrintSubarray4D(format("decomposition: 
after TR-1D[{}]", l), dw_out); + } // debug + + curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; + dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); + Ipk2Reo().Execute( + curr_dim_r, curr_dim_c, curr_dim_f, + SubArray(hierarchy.am_array[curr_dim_c][l + 1]), + SubArray(hierarchy.bm_array[curr_dim_c][l + 1]), dw_out, queue_idx); + + if (multidim_refactoring_debug_print) { // debug + PrintSubarray4D(format("decomposition: after TR-2D[{}]", l), dw_out); + } // debug + + curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; + dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); + Ipk3Reo().Execute( + curr_dim_r, curr_dim_c, curr_dim_f, + SubArray(hierarchy.am_array[curr_dim_r][l + 1]), + SubArray(hierarchy.bm_array[curr_dim_r][l + 1]), dw_out, queue_idx); + + if (multidim_refactoring_debug_print) { // debug + PrintSubarray4D(format("decomposition: after TR-3D[{}]", l), dw_out); + } // debug + + // mass trans 4D+ + for (int i = 3; i < D; i++) { + curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = i; + dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); + Ipk3Reo().Execute( + curr_dim_r, curr_dim_c, curr_dim_f, + SubArray(hierarchy.am_array[curr_dim_r][l + 1]), + SubArray(hierarchy.bm_array[curr_dim_r][l + 1]), dw_out, queue_idx); + if (multidim_refactoring_debug_print) { // debug + PrintSubarray4D(format("decomposition: after TR-{}D[{}]", i + 1, l), + dw_out); + } // debug + } + + dcorrection = dw_out; +} + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel.h b/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel.h deleted file mode 100644 index e388e184b3..0000000000 --- a/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#ifndef MGARD_X_ITERATIVE_PROCESSING_KERNEL -#define MGARD_X_ITERATIVE_PROCESSING_KERNEL - -#include "../../Common.h" - -namespace mgard_x { - -template -void ipk_1(Handle &handle, SIZE *shape_h, SIZE *shape_c_h, SIZE *shape_d, - SIZE *shape_c_d, SIZE *ldvs, SIZE *ldws, DIM processed_n, - DIM *processed_dims_h, DIM *processed_dims_d, DIM curr_dim_r, - DIM curr_dim_c, DIM curr_dim_f, T *am, T *bm, T *ddist_f, T *dv, - LENGTH lddv1, LENGTH lddv2, int queue_idx, int config); - -template -void ipk_2(Handle &handle, SIZE *shape_h, SIZE *shape_c_h, SIZE *shape_d, - SIZE *shape_c_d, SIZE *ldvs, SIZE *ldws, DIM processed_n, - DIM *processed_dims_h, DIM *processed_dims_d, DIM curr_dim_r, - DIM curr_dim_c, DIM curr_dim_f, T *am, T *bm, T *ddist_c, T *dv, - LENGTH lddv1, LENGTH lddv2, int queue_idx, int config); - -template -void ipk_3(Handle &handle, SIZE *shape_h, SIZE *shape_c_h, SIZE *shape_d, - SIZE *shape_c_d, SIZE *ldvs, SIZE *ldws, DIM processed_n, - DIM *processed_dims_h, DIM *processed_dims_d, DIM curr_dim_r, - DIM curr_dim_c, DIM curr_dim_f, T *am, T *bm, T *ddist_r, T *dv, - LENGTH lddv1, LENGTH lddv2, int queue_idx, int config); -} // namespace mgard_x - -#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel.hpp index 1d07134ee8..cb2c4a3029 100644 --- a/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel.hpp +++ b/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel.hpp @@ -618,14 +618,11 @@ class Ipk1Reo : public AutoTuner { SubArray<1, T, DeviceType> am, SubArray<1, T, DeviceType> bm, SubArray v, int queue_idx) { int range_l = std::min(6, (int)std::log2(v.getShape(curr_dim_f)) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_ts1[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.ipk1_nd[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define IPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -638,22 +635,26 @@ class Ipk1Reo : public AutoTuner { TaskType task = GenTask(curr_dim_r, curr_dim_c, curr_dim_f, \ am, bm, v, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - IPK(0) - IPK(1) - IPK(2) - IPK(3) - IPK(4) - IPK(5) - IPK(6) + IPK(6) if (!ret.success) config--; + IPK(5) if (!ret.success) config--; + IPK(4) if (!ret.success) config--; + IPK(3) if (!ret.success) config--; + IPK(2) if (!ret.success) config--; + IPK(1) if (!ret.success) config--; + IPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for Ipk1Reo.\n"; + exit(-1); + } #undef IPK if (AutoTuner::ProfileKernels) { @@ -1320,14 +1321,11 @@ class Ipk2Reo : public AutoTuner { SubArray<1, T, DeviceType> am, SubArray<1, T, DeviceType> bm, SubArray v, int queue_idx) { int range_l = std::min(6, (int)std::log2(v.getShape(curr_dim_f)) - 
1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_ts2[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.ipk2_nd[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define IPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -1340,22 +1338,26 @@ class Ipk2Reo : public AutoTuner { TaskType task = GenTask(curr_dim_r, curr_dim_c, curr_dim_f, \ am, bm, v, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - IPK(0) - IPK(1) - IPK(2) - IPK(3) - IPK(4) - IPK(5) - IPK(6) + IPK(6) if (!ret.success) config--; + IPK(5) if (!ret.success) config--; + IPK(4) if (!ret.success) config--; + IPK(3) if (!ret.success) config--; + IPK(2) if (!ret.success) config--; + IPK(1) if (!ret.success) config--; + IPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for Ipk2Reo.\n"; + exit(-1); + } #undef IPK if (AutoTuner::ProfileKernels) { @@ -2047,14 +2049,11 @@ class Ipk3Reo : public AutoTuner { SubArray<1, T, DeviceType> am, SubArray<1, T, DeviceType> bm, SubArray v, int queue_idx) { int range_l = std::min(6, (int)std::log2(v.getShape(curr_dim_f)) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_ts3[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.ipk3_nd[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define IPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -2067,22 +2066,26 @@ class Ipk3Reo : public AutoTuner { TaskType task = GenTask(curr_dim_r, curr_dim_c, curr_dim_f, \ am, bm, v, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - IPK(0) - IPK(1) - IPK(2) - IPK(3) - IPK(4) - IPK(5) - IPK(6) + IPK(6) if (!ret.success) config--; + IPK(5) if (!ret.success) config--; + IPK(4) if (!ret.success) config--; + IPK(3) if (!ret.success) config--; + IPK(2) if (!ret.success) config--; + IPK(1) if (!ret.success) config--; + IPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for Ipk3Reo.\n"; + exit(-1); + } #undef IPK if (AutoTuner::ProfileKernels) { diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel3D.h b/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel3D.h deleted file mode 100644 index bb2a31552f..0000000000 --- a/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel3D.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#ifndef MGARD_X_ITERATIVE_PROCESSING_KERNEL_3D -#define MGARD_X_ITERATIVE_PROCESSING_KERNEL_3D - -#include "../../Common.h" - -namespace mgard_x { - -template -void ipk_1_3d(Handle &handle, SIZE nr, SIZE nc, SIZE nf_c, T *am, T *bm, - T *ddist_f, T *dv, SIZE lddv1, SIZE lddv2, int queue_idx, - int config); - -template -void ipk_2_3d(Handle &handle, SIZE nr, SIZE nc_c, SIZE nf_c, T *am, T *bm, - T *ddist_c, T *dv, SIZE lddv1, SIZE lddv2, int queue_idx, - int config); - -template -void ipk_3_3d(Handle &handle, SIZE nr_c, SIZE nc_c, SIZE nf_c, T *am, - T *bm, T *ddist_r, T *dv, SIZE lddv1, SIZE lddv2, int queue_idx, - int config); - -} // namespace mgard_x - -#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel3D.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel3D.hpp index 655aa451bb..259a03f575 100644 --- a/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel3D.hpp +++ b/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel3D.hpp @@ -401,14 +401,11 @@ class Ipk1Reo3D : public AutoTuner { SubArray<1, T, DeviceType> bm, SubArray<1, T, DeviceType> dist_f, SubArray v, int queue_idx) { int range_l = std::min(6, (int)std::log2(nf) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_ts1[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.ipk1_3d[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define IPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -421,22 +418,26 @@ class Ipk1Reo3D : public AutoTuner { TaskType task = \ GenTask(nr, nc, nf, am, bm, dist_f, v, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - IPK(0) - IPK(1) - IPK(2) - IPK(3) - IPK(4) - IPK(5) - IPK(6) + IPK(6) if (!ret.success) config--; + IPK(5) if (!ret.success) config--; + IPK(4) if (!ret.success) config--; + IPK(3) if (!ret.success) config--; + IPK(2) if (!ret.success) config--; + IPK(1) if (!ret.success) config--; + IPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for Ipk1Reo3D.\n"; + exit(-1); + } #undef IPK if (AutoTuner::ProfileKernels) { @@ -810,14 +811,11 @@ class Ipk2Reo3D : public AutoTuner { SubArray<1, T, DeviceType> bm, SubArray<1, T, DeviceType> dist_c, SubArray v, int queue_idx) { int range_l = std::min(6, (int)std::log2(nf) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_ts2[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.ipk2_3d[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define IPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -830,22 +828,26 @@ class Ipk2Reo3D : public AutoTuner { TaskType task = \ GenTask(nr, nc, nf, am, bm, dist_c, v, queue_idx); \ 
DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - IPK(0) - IPK(1) - IPK(2) - IPK(3) - IPK(4) - IPK(5) - IPK(6) + IPK(6) if (!ret.success) config--; + IPK(5) if (!ret.success) config--; + IPK(4) if (!ret.success) config--; + IPK(3) if (!ret.success) config--; + IPK(2) if (!ret.success) config--; + IPK(1) if (!ret.success) config--; + IPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for Ipk2Reo3D.\n"; + exit(-1); + } #undef IPK if (AutoTuner::ProfileKernels) { @@ -1219,14 +1221,11 @@ class Ipk3Reo3D : public AutoTuner { SubArray<1, T, DeviceType> bm, SubArray<1, T, DeviceType> dist_r, SubArray v, int queue_idx) { int range_l = std::min(6, (int)std::log2(nf) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_ts3[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.ipk3_3d[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define IPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -1239,22 +1238,26 @@ class Ipk3Reo3D : public AutoTuner { TaskType task = \ GenTask(nr, nc, nf, am, bm, dist_r, v, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - IPK(0) - IPK(1) - IPK(2) - IPK(3) - IPK(4) - IPK(5) - IPK(6) + IPK(6) if (!ret.success) config--; + IPK(5) if (!ret.success) config--; + IPK(4) if (!ret.success) config--; + IPK(3) if (!ret.success) config--; + IPK(2) if (!ret.success) config--; + IPK(1) if (!ret.success) config--; + IPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for Ipk3Reo3D.\n"; + exit(-1); + } #undef IPK if (AutoTuner::ProfileKernels) { @@ -1263,1162 +1266,6 @@ class Ipk3Reo3D : public AutoTuner { } }; -// template -// __global__ void _ipk_1_3d(SIZE nr, SIZE nc, SIZE nf_c, T *am, T *bm, T -// *dist_f, -// T *v, SIZE ldv1, SIZE ldv2) { - -// SIZE c_gl = blockIdx.x * C; -// SIZE r_gl = blockIdx.y * R; -// SIZE f_gl = threadIdx.x; - -// SIZE c_sm = threadIdx.x; -// SIZE r_sm = threadIdx.y; -// SIZE f_sm = threadIdx.x; - -// T *vec = v + get_idx(ldv1, ldv2, r_gl, c_gl, 0); -// T *sm = SharedMemory(); -// SIZE ldsm1 = F + G; -// SIZE ldsm2 = C; -// T *vec_sm = sm; -// T *am_sm = sm + R * ldsm1 * ldsm2; -// T *bm_sm = am_sm + ldsm1; - -// T prev_vec_sm = 0.0; - -// SIZE c_rest = min(C, nc - blockIdx.x * C); -// SIZE r_rest = min(R, nr - blockIdx.y * R); - -// // printf("r_rest: %u, c_rest: %u\n", r_rest, c_rest); -// // printf("RCF: %u %u %u\n", R,C,F); -// // printf("n: %u %u %u\n", nr, nc, nf_c); - -// SIZE f_rest = nf_c; -// SIZE f_ghost = min(nf_c, G); -// // printf("G%u, f_ghost:%u\n ", G, f_ghost); -// SIZE f_main = F; - -// /* Load first ghost */ -// if (r_sm < r_rest && f_sm < f_ghost) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = -// vec[get_idx(ldv1, ldv2, r_sm, i, f_gl)]; -// } -// if (r_sm 
== 0) { -// am_sm[f_sm] = am[f_gl]; -// bm_sm[f_sm] = bm[f_gl]; -// // printf("am[%u]: %f, bm[%u]: %f\n", f_sm, f_sm, am_sm[f_sm], -// bm_sm[f_sm]); -// } -// } - -// f_rest -= f_ghost; -// __syncthreads(); - -// while (f_rest > F - f_ghost) { -// f_main = min(F, f_rest); -// if (r_sm < r_rest && f_sm < f_main) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm + f_ghost)] = -// vec[get_idx(ldv1, ldv2, r_sm, i, f_gl + f_ghost)]; -// } -// if (r_sm == 0) { -// am_sm[f_sm + f_ghost] = am[f_gl + f_ghost]; -// bm_sm[f_sm + f_ghost] = bm[f_gl + f_ghost]; -// } -// } - -// __syncthreads(); - -// /* Computation of v in parallel*/ -// if (r_sm < r_rest && c_sm < c_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = tridiag_forward2( -// prev_vec_sm, am_sm[0], bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, r_sm, -// c_sm, 0)]); - -// //#pragma unroll 32 -// for (SIZE i = 1; i < F; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] = tridiag_forward2( -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)], am_sm[i], -// bm_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)]); -// } - -// /* Store last v */ -// prev_vec_sm = vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, F - 1)]; -// } -// __syncthreads(); - -// /* flush results to v */ -// if (r_sm < r_rest && f_sm < F) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec[get_idx(ldv1, ldv2, r_sm, i, f_gl)] = -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; -// } -// } -// __syncthreads(); - -// /* Update unloaded col */ -// f_rest -= f_main; - -// /* Advance c */ -// f_gl += F; - -// /* Copy next ghost to main */ -// f_ghost = min(G, f_main - (F - G)); -// if (r_sm < r_rest && f_sm < f_ghost) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm + F)]; -// } -// if (r_sm == 0) { -// am_sm[f_sm] = am_sm[f_sm + blockDim.x]; -// bm_sm[f_sm] = bm_sm[f_sm + blockDim.x]; -// } -// } -// __syncthreads(); -// } // end of while - -// /* Load all rest col */ -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm + f_ghost)] = -// vec[get_idx(ldv1, ldv2, r_sm, i, f_gl + f_ghost)]; -// } -// if (r_sm == 0) { -// am_sm[f_sm + f_ghost] = am[f_gl + f_ghost]; -// bm_sm[f_sm + f_ghost] = bm[f_gl + f_ghost]; -// } -// } - -// __syncthreads(); - -// /* Only 1 col remain */ -// if (f_ghost + f_rest == 1) { -// if (r_sm < r_rest && c_sm < c_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = tridiag_forward2( -// prev_vec_sm, am_sm[0], bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, r_sm, -// c_sm, 0)]); -// } -// //__syncthreads(); - -// } else { -// if (r_sm < r_rest && c_sm < c_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = tridiag_forward2( -// prev_vec_sm, am_sm[0], bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, r_sm, -// c_sm, 0)]); -// for (SIZE i = 1; i < f_ghost + f_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] = tridiag_forward2( -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)], am_sm[i], -// bm_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)]); -// } -// } -// } -// __syncthreads(); -// /* flush results to v */ -// if (r_sm < r_rest && f_sm < f_ghost + f_rest) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec[get_idx(ldv1, ldv2, r_sm, i, f_gl)] = -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; -// } -// } -// __syncthreads(); - -// /* backward */ -// f_rest = nf_c; -// f_ghost = min(nf_c, G); -// f_main = F; -// f_gl = threadIdx.x; -// 
prev_vec_sm = 0.0; - -// /* Load first ghost */ -// if (r_sm < r_rest && f_sm < f_ghost) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = -// vec[get_idx(ldv1, ldv2, r_sm, i, (nf_c - 1) - f_gl)]; -// } -// } -// if (r_sm == 0 && f_gl <= nf_c) { -// am_sm[f_sm] = am[nf_c - f_gl]; -// bm_sm[f_sm] = bm[nf_c - f_gl]; // * -1; -// } -// f_rest -= f_ghost; -// __syncthreads(); - -// while (f_rest > F - f_ghost) { -// f_main = min(F, f_rest); -// if (r_sm < r_rest && f_sm < f_main) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm + f_ghost)] = -// vec[get_idx(ldv1, ldv2, r_sm, i, (nf_c - 1) - f_gl - f_ghost)]; -// } -// } -// if (r_sm == 0 && f_gl + f_ghost <= nf_c) { -// am_sm[f_sm + f_ghost] = am[nf_c - f_gl - f_ghost]; -// bm_sm[f_sm + f_ghost] = bm[nf_c - f_gl - f_ghost]; // * -1; -// } -// __syncthreads(); - -// /* Computation of v in parallel*/ -// if (r_sm < r_rest && c_sm < c_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = -// tridiag_backward2(prev_vec_sm, am_sm[0], bm_sm[0], -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]); -// //#pragma unroll 32 -// for (SIZE i = 1; i < F; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] = tridiag_backward2( -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)], -// am_sm[i], bm_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, -// i)]); -// } -// /* Store last v */ -// prev_vec_sm = vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, blockDim.x - -// 1)]; -// } -// __syncthreads(); - -// /* flush results to v */ -// if (r_sm < r_rest && f_sm < F) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec[get_idx(ldv1, ldv2, r_sm, i, (nf_c - 1) - f_gl)] = -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; -// } -// } -// __syncthreads(); - -// /* Update unloaded col */ -// f_rest -= f_main; - -// /* Advance c */ -// f_gl += F; - -// /* Copy next ghost to main */ -// f_ghost = min(G, f_main - (F - G)); -// if (r_sm < r_rest && f_sm < f_ghost) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm + F)]; -// } -// if (r_sm == 0) { -// am_sm[f_sm] = am_sm[f_sm + F]; -// bm_sm[f_sm] = bm_sm[f_sm + F]; -// } -// } -// __syncthreads(); -// } // end of while - -// /* Load all rest col */ -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm + f_ghost)] = -// vec[get_idx(ldv1, ldv2, r_sm, i, (nf_c - 1) - f_gl - f_ghost)]; -// } -// } -// if (r_sm == 0 && f_gl + f_ghost <= nf_c) { -// am_sm[f_sm + f_ghost] = am[nf_c - f_gl - f_ghost]; -// bm_sm[f_sm + f_ghost] = bm[nf_c - f_gl - f_ghost]; -// // printf("%u %u\n", f_gl, f_ghost); -// } -// __syncthreads(); - -// /* Only 1 col remain */ -// if (f_ghost + f_rest == 1) { -// if (r_sm < r_rest && c_sm < c_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = -// tridiag_backward2(prev_vec_sm, am_sm[0], bm_sm[0], -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]); -// } -// //__syncthreads(); - -// } else { -// if (r_sm < r_rest && c_sm < c_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = -// tridiag_backward2(prev_vec_sm, am_sm[0], bm_sm[0], -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]); -// for (SIZE i = 1; i < f_ghost + f_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] = tridiag_backward2( -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)], -// am_sm[i], bm_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, -// i)]); -// } -// } -// } -// 
__syncthreads(); -// /* flush results to v */ -// if (r_sm < r_rest && f_sm < f_ghost + f_rest) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec[get_idx(ldv1, ldv2, r_sm, i, (nf_c - 1) - f_gl)] = -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; -// } -// } -// __syncthreads(); -// } - -// template -// void ipk_1_3d_adaptive_launcher(Handle &handle, SIZE nr, SIZE nc, SIZE -// nf_c, -// T *am, T *bm, T *ddist_f, T *dv, SIZE lddv1, -// SIZE lddv2, int queue_idx) { -// // std::cout << "test\n"; - -// SIZE total_thread_x = nc; -// SIZE total_thread_y = nr; -// SIZE total_thread_z = 1; -// SIZE tbx, tby, tbz, gridx, gridy, gridz; -// dim3 threadsPerBlock, blockPerGrid; -// SIZE sm_size; - -// tbx = C;//std::max(C, std::min(C, total_thread_x)); -// tby = R;//std::max(R, std::min(R, total_thread_y)); -// tbz = 1; -// sm_size = (R * C + 2) * (F + G) * sizeof(T); -// gridx = ceil((float)total_thread_x / tbx); -// gridy = ceil((float)total_thread_y / tby); -// gridz = 1; -// threadsPerBlock = dim3(F, tby, tbz); -// blockPerGrid = dim3(gridx, gridy, gridz); - -// _ipk_1_3d<<>>( -// nr, nc, nf_c, am, bm, ddist_f, dv, lddv1, lddv2); -// gpuErrchk(cudaGetLastError()); -// if (handle.sync_and_check_all_kernels) { -// gpuErrchk(cudaDeviceSynchronize()); -// } -// // std::cout << "test\n"; -// } - -// template -// void ipk_1_3d(Handle &handle, SIZE nr, SIZE nc, SIZE nf_c, T *am, T -// *bm, -// T *ddist_f, T *dv, SIZE lddv1, SIZE lddv2, int queue_idx, -// int config) { - -// #define IPK(R, C, F, G) \ -// { \ -// ipk_1_3d_adaptive_launcher( \ -// handle, nr, nc, nf_c, am, bm, ddist_f, dv, lddv1, lddv2, queue_idx); \ -// } -// bool profile = false; -// if (handle.profile_kernels) { -// profile = true; -// } -// if (D == 3) { -// if (profile || config == 6) { -// IPK(2, 2, 128, 2) -// } -// if (profile || config == 5) { -// IPK(2, 2, 64, 2) -// } -// if (profile || config == 4) { -// IPK(2, 2, 32, 2) -// } -// if (profile || config == 3) { -// IPK(4, 4, 16, 4) -// } -// if (profile || config == 2) { -// IPK(8, 8, 8, 4) -// } -// if (profile || config == 1) { -// IPK(4, 4, 4, 4) -// } -// if (profile || config == 0) { -// IPK(2, 2, 2, 2) -// } -// } else if (D == 2) { -// if (profile || config == 6) { -// IPK(1, 2, 128, 2) -// } -// if (profile || config == 5) { -// IPK(1, 2, 64, 2) -// } -// if (profile || config == 4) { -// IPK(1, 2, 32, 2) -// } -// if (profile || config == 3) { -// IPK(1, 4, 16, 4) -// } -// if (profile || config == 2) { -// IPK(1, 8, 8, 4) -// } -// if (profile || config == 1) { -// IPK(1, 4, 4, 4) -// } -// if (profile || config == 0) { -// IPK(1, 2, 4, 2) -// } -// } else if (D == 1) { -// if (profile || config == 6) { -// IPK(1, 1, 128, 2) -// } -// if (profile || config == 5) { -// IPK(1, 1, 64, 2) -// } -// if (profile || config == 4) { -// IPK(1, 1, 32, 2) -// } -// if (profile || config == 3) { -// IPK(1, 1, 16, 4) -// } -// if (profile || config == 2) { -// IPK(1, 1, 8, 4) -// } -// if (profile || config == 1) { -// IPK(1, 1, 8, 4) -// } -// if (profile || config == 0) { -// IPK(1, 1, 8, 2) -// } -// } -// #undef IPK -// } - -// template -// __global__ void _ipk_2_3d(SIZE nr, SIZE nc_c, SIZE nf_c, T *am, T *bm, T -// *dist_c, -// T *v, SIZE ldv1, SIZE ldv2) { - -// SIZE f_gl = blockIdx.x * F; -// SIZE r_gl = blockIdx.y * R; -// SIZE c_gl = 0; - -// SIZE f_sm = threadIdx.x; -// SIZE r_sm = threadIdx.y; -// SIZE c_sm = threadIdx.x; - -// T *vec = v + get_idx(ldv1, ldv2, r_gl, 0, f_gl); -// T *sm = SharedMemory(); -// SIZE ldsm1 = F; -// SIZE ldsm2 = C + G; -// T *vec_sm = sm; -// T 
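For context on the deleted kernels: each one streams a full line of the volume through a fixed shared-memory window, loading G "ghost" entries, then repeatedly loading F main entries, computing, flushing, and recycling the last G entries as the head of the next window. A serial 1-D sketch of that window management (self-contained; F and G play the same roles as in the kernels):

// Host-side sketch of the F-main / G-ghost sliding window used by the
// deleted _ipk_*_3d kernels (1-D, serial; compute step elided).
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  const int F = 8, G = 2, n = 29;
  std::vector<double> v(n, 1.0), buf(F + G);
  int rest = n, gl = 0;
  int ghost = std::min(n, G);
  std::copy(v.begin(), v.begin() + ghost, buf.begin()); // first ghost load
  rest -= ghost;
  while (rest > F - ghost) {
    int main_n = std::min(F, rest);
    std::copy(v.begin() + gl + ghost, v.begin() + gl + ghost + main_n,
              buf.begin() + ghost); // load main part behind the ghost
    // ... compute on buf[0 .. F) here ...
    std::copy(buf.begin(), buf.begin() + F, v.begin() + gl); // flush results
    rest -= main_n;
    gl += F;
    ghost = std::min(G, main_n - (F - G)); // next ghost size
    std::copy(buf.begin() + F, buf.begin() + F + ghost,
              buf.begin()); // recycle tail as next window's head
  }
  // tail: load the remaining `rest` entries after the ghost and finish
  std::printf("processed %d of %d entries in the main loop\n", gl, n);
  return 0;
}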
*am_sm = sm + R * ldsm1 * ldsm2; -// T *bm_sm = am_sm + ldsm2; - -// T prev_vec_sm = 0.0; - -// SIZE f_rest = min(F, nf_c - blockIdx.x * F); -// SIZE r_rest = min(R, nr - blockIdx.y * R); - -// // if (blockIdx.x == 1 && blockIdx.y == 0 && f_sm == 0 && r_sm == 0) { -// // prSIZEf("f_rest: %d r_rest: %d\n", f_rest, r_rest); -// // } - -// SIZE c_rest = nc_c; -// SIZE c_ghost = min(nc_c, G); -// SIZE c_main = C; - -// /* Load first ghost */ -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < c_ghost; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = -// vec[get_idx(ldv1, ldv2, r_sm, c_gl + i, f_sm)]; -// // if (r_sm == 0) printf("r0_stride = %d, vec_sm[%d] = %f\n", -// r0_stride, -// // i, vec_sm[i * ldsm + c_sm]); -// } -// } -// if (r_sm == 0 && c_sm < c_ghost) { -// am_sm[c_sm] = am[c_gl + c_sm]; -// bm_sm[c_sm] = bm[c_gl + c_sm]; -// } -// c_rest -= c_ghost; -// __syncthreads(); - -// while (c_rest > C - c_ghost) { -// // printf("%d %d %d\n", c_rest, C, c_ghost); -// c_main = min(C, c_rest); -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < c_main; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + c_ghost, f_sm)] = -// vec[get_idx(ldv1, ldv2, r_sm, c_gl + i + c_ghost, f_sm)]; -// } -// } -// if (r_sm == 0 && c_sm < c_main){ -// am_sm[c_sm + c_ghost] = am[c_gl + c_sm + c_ghost]; -// bm_sm[c_sm + c_ghost] = bm[c_gl + c_sm + c_ghost]; -// } -// __syncthreads(); - -// /* Computation of v in parallel*/ -// if (r_sm < r_rest && f_sm < f_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = tridiag_forward2( -// prev_vec_sm, am_sm[0], bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, r_sm, -// 0, f_sm)]); - -// for (SIZE i = 1; i < C; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = tridiag_forward2( -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)], am_sm[i], -// bm_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]); -// } -// /* Store last v */ -// prev_vec_sm = vec_sm[get_idx(ldsm1, ldsm2, r_sm, C - 1, f_sm)]; -// } -// __syncthreads(); - -// /* flush results to v */ -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < C; i++) { -// vec[get_idx(ldv1, ldv2, r_sm, c_gl + i, f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; -// } -// } -// __syncthreads(); - -// /* Update unloaded col */ -// c_rest -= c_main; - -// /* Advance c */ -// c_gl += C; - -// /* Copy next ghost to main */ -// c_ghost = min(G, c_main - (C - G)); -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < c_ghost; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + C, f_sm)]; -// } -// } -// if (r_sm == 0 && c_sm < c_ghost) { -// am_sm[c_sm] = am_sm[c_sm + C]; -// bm_sm[c_sm] = bm_sm[c_sm + C]; -// } -// __syncthreads(); - -// } // end of while - -// /* Load all rest col */ -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + c_ghost, f_sm)] = -// vec[get_idx(ldv1, ldv2, r_sm, c_gl + i + c_ghost, f_sm)]; -// } -// } -// if (r_sm == 0 && c_sm < c_rest) { -// am_sm[c_sm + c_ghost] = am[c_gl + c_sm + c_ghost]; -// bm_sm[c_sm + c_ghost] = bm[c_gl + c_sm + c_ghost]; -// } -// __syncthreads(); - -// /* Only 1 col remain */ -// if (c_ghost + c_rest == 1) { -// if (r_sm < r_rest && f_sm < f_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = tridiag_forward2( -// prev_vec_sm, am_sm[0], bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, r_sm, -// 0, f_sm)]); -// } -// //__syncthreads(); - -// } else { -// if (r_sm < r_rest && f_sm < f_rest) 
{ -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = tridiag_forward2( -// prev_vec_sm, am_sm[0], bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, r_sm, -// 0, f_sm)]); -// for (SIZE i = 1; i < c_ghost + c_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = tridiag_forward2( -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)], am_sm[i], -// bm_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]); -// } -// } -// } -// __syncthreads(); -// /* flush results to v */ -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < c_ghost + c_rest; i++) { -// vec[get_idx(ldv1, ldv2, r_sm, c_gl + i, f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; -// } -// } -// __syncthreads(); - -// /* backward */ -// c_rest = nc_c; -// c_ghost = min(nc_c, G); -// c_main = C; -// c_gl = 0; -// prev_vec_sm = 0.0; - -// /* Load first ghost */ -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < c_ghost; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = -// vec[get_idx(ldv1, ldv2, r_sm, (nc_c - 1) - (c_gl + i), f_sm)]; -// } -// } -// if (r_sm == 0 && c_sm < c_ghost) { -// am_sm[c_sm] = am[nc_c - (c_gl + c_sm)]; -// bm_sm[c_sm] = bm[nc_c - (c_gl + c_sm)]; -// } -// c_rest -= c_ghost; -// __syncthreads(); - -// while (c_rest > C - c_ghost) { -// // printf("%d %d %d\n", c_rest, C, c_ghost); -// c_main = min(C, c_rest); -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < c_main; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + c_ghost, f_sm)] = vec[get_idx( -// ldv1, ldv2, r_sm, (nc_c - 1) - (c_gl + i + c_ghost), f_sm)]; -// } -// } -// if (r_sm == 0 && c_sm < c_main) { -// am_sm[c_sm + c_ghost] = am[nc_c- (c_gl + c_sm + c_ghost)]; -// bm_sm[c_sm + c_ghost] = bm[nc_c- (c_gl + c_sm + c_ghost)]; -// } -// __syncthreads(); - -// /* Computation of v in parallel*/ -// if (r_sm < r_rest && f_sm < f_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, c_sm)] = -// tridiag_backward2(prev_vec_sm, am_sm[0], bm_sm[0], -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, c_sm)]); - -// for (SIZE i = 1; i < C; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = tridiag_backward2( -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)], -// am_sm[i], bm_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, -// f_sm)]); -// } - -// /* Store last v */ -// prev_vec_sm = vec_sm[get_idx(ldsm1, ldsm2, r_sm, C - 1, f_sm)]; -// } -// __syncthreads(); - -// /* flush results to v */ -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < C; i++) { -// vec[get_idx(ldv1, ldv2, r_sm, (nc_c - 1) - (c_gl + i), f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; -// } -// } -// __syncthreads(); - -// /* Update unloaded col */ -// c_rest -= c_main; - -// /* Advance c */ -// c_gl += C; - -// /* Copy next ghost to main */ -// c_ghost = min(G, c_main - (C - G)); -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < c_ghost; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + C, f_sm)]; -// } -// } -// if (r_sm == 0 && c_sm < c_ghost) { -// am_sm[c_sm] = am_sm[c_sm + C]; -// bm_sm[c_sm] = bm_sm[c_sm + C]; -// } -// __syncthreads(); - -// } // end of while - -// // Load all rest col -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + c_ghost, f_sm)] = vec[get_idx( -// ldv1, ldv2, r_sm, (nc_c - 1) - (c_gl + i + c_ghost), f_sm)]; -// } -// } -// if (r_sm == 0 && c_sm < c_rest) { -// am_sm[c_sm + c_ghost] = am[nc_c - (c_gl + c_sm + c_ghost)]; -// bm_sm[c_sm + c_ghost] = 
bm[nc_c - (c_gl + c_sm + c_ghost)]; -// } -// __syncthreads(); - -// /* Only 1 col remain */ -// if (c_ghost + c_rest == 1) { -// if (r_sm < r_rest && f_sm < f_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, c_sm)] = -// tridiag_backward2(prev_vec_sm, am_sm[0], bm_sm[0], -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, c_sm)]); -// } -// //__syncthreads(); - -// } else { -// if (r_sm < r_rest && f_sm < f_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, c_sm)] = -// tridiag_backward2(prev_vec_sm, am_sm[0], bm_sm[0], -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, c_sm)]); -// for (SIZE i = 1; i < c_ghost + c_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = tridiag_backward2( -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)], -// am_sm[i], bm_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, -// f_sm)]); -// } -// } -// } -// __syncthreads(); -// /* flush results to v */ -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < c_ghost + c_rest; i++) { -// vec[get_idx(ldv1, ldv2, r_sm, (nc_c - 1) - (c_gl + i), f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; -// // printf("c_stride = %d, c_sm = %d, vec_sm = %f, vec[%d] = -// // %f\n",c_stride, c_sm, vec_sm[r_sm * ldsm + 0],i * row_stride * lddv -// + -// // c_stride, vec[i * row_stride * lddv + c_stride]); -// } -// } -// __syncthreads(); -// } - -// template -// void ipk_2_3d_adaptive_launcher(Handle &handle, SIZE nr, SIZE nc_c, -// SIZE nf_c, T *am, T *bm, T *ddist_c, T *dv, -// SIZE lddv1, SIZE lddv2, int queue_idx) { -// SIZE total_thread_x = nf_c; -// SIZE total_thread_y = nr; -// SIZE total_thread_z = 1; -// SIZE tbx, tby, tbz, gridx, gridy, gridz; -// dim3 threadsPerBlock, blockPerGrid; -// size_t sm_size; - -// tbx = F;//std::max(F, std::min(F, total_thread_x)); -// tby = R;//std::max(R, std::min(R, total_thread_y)); -// tbz = 1; -// sm_size = (R * F + 2) * (C + G) * sizeof(T); -// gridx = ceil((float)total_thread_x / tbx); -// gridy = ceil((float)total_thread_y / tby); -// gridz = 1; -// threadsPerBlock = dim3(tbx, tby, tbz); -// blockPerGrid = dim3(gridx, gridy, gridz); -// _ipk_2_3d<<>>( -// nr, nc_c, nf_c, am, bm, ddist_c, dv, lddv1, lddv2); -// gpuErrchk(cudaGetLastError()); -// if (handle.sync_and_check_all_kernels) { -// gpuErrchk(cudaDeviceSynchronize()); -// } -// } - -// template -// void ipk_2_3d(Handle &handle, SIZE nr, SIZE nc_c, SIZE nf_c, T *am, T -// *bm, -// T *ddist_c, T *dv, SIZE lddv1, SIZE lddv2, int queue_idx, -// int config) { - -// #define IPK(R, C, F, G) \ -// { \ -// ipk_2_3d_adaptive_launcher( \ -// handle, nr, nc_c, nf_c, am, bm, ddist_c, dv, lddv1, lddv2, queue_idx); \ -// } -// bool profile = false; -// if (handle.profile_kernels) { -// profile = true; -// } -// if (D == 3) { -// if (profile || config == 6) { -// IPK(2, 2, 128, 2) -// } -// if (profile || config == 5) { -// IPK(2, 2, 64, 2) -// } -// if (profile || config == 4) { -// IPK(2, 2, 32, 2) -// } -// if (profile || config == 3) { -// IPK(4, 4, 16, 4) -// } -// if (profile || config == 2) { -// IPK(8, 8, 8, 4) -// } -// if (profile || config == 1) { -// IPK(4, 4, 4, 4) -// } -// if (profile || config == 0) { -// IPK(2, 2, 2, 2) -// } -// } else if (D == 2) { -// if (profile || config == 6) { -// IPK(1, 2, 128, 2) -// } -// if (profile || config == 5) { -// IPK(1, 2, 64, 2) -// } -// if (profile || config == 4) { -// IPK(1, 2, 32, 2) -// } -// if (profile || config == 3) { -// IPK(1, 4, 16, 4) -// } -// if (profile || config == 2) { -// IPK(1, 8, 8, 4) -// } -// if (profile || config == 1) { -// IPK(1, 4, 4, 4) -// } -// if 
(profile || config == 0) { -// IPK(1, 2, 4, 2) -// } -// } else { -// printf("Error: ipk_2_3d is only for 3D and 2D data\n"); -// } -// #undef IPK -// } - -// template -// __global__ void _ipk_3_3d(SIZE nr_c, SIZE nc_c, SIZE nf_c, T *am, T *bm, T -// *dist_r, -// T *v, SIZE ldv1, SIZE ldv2) { - -// SIZE f_gl = blockIdx.x * F; -// SIZE c_gl = blockIdx.y * C; -// SIZE r_gl = 0; - -// SIZE f_sm = threadIdx.x; -// SIZE c_sm = threadIdx.y; -// SIZE r_sm = threadIdx.x; - -// T *vec = v + get_idx(ldv1, ldv2, 0, c_gl, f_gl); -// T *sm = SharedMemory(); -// SIZE ldsm1 = F; -// SIZE ldsm2 = C; -// T *vec_sm = sm; -// T *am_sm = sm + (R + G) * ldsm1 * ldsm2; -// T *bm_sm = am_sm + (R + G); - -// T prev_vec_sm = 0.0; - -// SIZE f_rest = min(F, nf_c - blockIdx.x * F); -// SIZE c_rest = min(C, nc_c - blockIdx.y * C); - -// SIZE r_rest = nr_c; -// SIZE r_ghost = min(nr_c, G); -// SIZE r_main = R; - -// /* Load first ghost */ -// if (c_sm < c_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < r_ghost; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = -// vec[get_idx(ldv1, ldv2, r_gl + i, c_sm, f_sm)]; -// } -// } - -// if (c_sm == 0 && r_sm < r_ghost) { -// am_sm[r_sm] = am[r_gl + r_sm]; -// bm_sm[r_sm] = bm[r_gl + r_sm]; -// } -// r_rest -= r_ghost; -// __syncthreads(); - -// while (r_rest > R - r_ghost) { -// r_main = min(R, r_rest); -// if (c_sm < c_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < r_main; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, i + r_ghost, c_sm, f_sm)] = -// vec[get_idx(ldv1, ldv2, r_gl + i + r_ghost, c_sm, f_sm)]; -// } -// } -// if (c_sm == 0 && r_sm < r_main) { -// am_sm[r_sm + r_ghost] = am[r_gl + r_sm + r_ghost]; -// bm_sm[r_sm + r_ghost] = bm[r_gl + r_sm + r_ghost]; -// } -// __syncthreads(); - -// /* Computation of v in parallel*/ -// if (c_sm < c_rest && f_sm < f_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = tridiag_forward2( -// prev_vec_sm, am_sm[0], bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, 0, -// c_sm, f_sm)]); -// for (SIZE i = 1; i < R; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = tridiag_forward2( -// vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, f_sm)], am_sm[i], -// bm_sm[i], vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]); -// } - -// /* Store last v */ -// prev_vec_sm = vec_sm[get_idx(ldsm1, ldsm2, R - 1, c_sm, f_sm)]; -// } -// __syncthreads(); - -// /* flush results to v */ -// if (c_sm < c_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < R; i++) { -// vec[get_idx(ldv1, ldv2, r_gl + i, c_sm, f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]; -// } -// } -// __syncthreads(); - -// // /* Update unloaded col */ -// r_rest -= r_main; - -// /* Advance c */ -// r_gl += R; - -// /* Copy next ghost to main */ -// r_ghost = min(G, r_main - (R - G)); -// if (c_sm < c_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < r_ghost; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, i + R, c_sm, f_sm)]; -// } -// } -// if (c_sm == 0 && r_sm < r_ghost) { -// am_sm[r_sm] = am_sm[r_sm + R]; -// bm_sm[r_sm] = bm_sm[r_sm + R]; -// } -// __syncthreads(); - -// } // end of while - -// /* Load all rest col */ -// if (c_sm < c_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < r_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, i + r_ghost, c_sm, f_sm)] = -// vec[get_idx(ldv1, ldv2, r_gl + i + r_ghost, c_sm, f_sm)]; -// } -// } - -// if (c_sm == 0 && r_sm < r_rest) { -// am_sm[r_sm + r_ghost] = am[r_gl + r_sm + r_ghost]; -// bm_sm[r_sm + r_ghost] = bm[r_gl + r_sm + r_ghost]; -// } -// __syncthreads(); - -// /* Only 1 col 
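The `config` integer selects compile-time tile shapes: each deleted launcher expands an `IPK(R, C, F, G)` macro once per shape and guards it with `profile || config == k`. The same dispatch can be sketched with a template and a switch (3-D shapes taken from the table above; `launch_tile` is a hypothetical stand-in for the adaptive launcher):

// Sketch of the runtime-config -> compile-time tile dispatch that the
// deleted ipk_*_3d launchers implement with the IPK(R, C, F, G) macro.
#include <cstdio>

template <int R, int C, int F, int G>
void launch_tile() { // stand-in for the templated kernel launch
  std::printf("launch R=%d C=%d F=%d G=%d\n", R, C, F, G);
}

void launch(int config) { // 3-D shapes from the ipk_1/ipk_2 table above
  switch (config) {
  case 6: launch_tile<2, 2, 128, 2>(); break;
  case 5: launch_tile<2, 2, 64, 2>(); break;
  case 4: launch_tile<2, 2, 32, 2>(); break;
  case 3: launch_tile<4, 4, 16, 4>(); break;
  case 2: launch_tile<8, 8, 8, 4>(); break;
  case 1: launch_tile<4, 4, 4, 4>(); break;
  default: launch_tile<2, 2, 2, 2>(); break;
  }
}

int main() { launch(3); return 0; }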
remain */ -// if (r_ghost + r_rest == 1) { -// if (c_sm < c_rest && f_sm < f_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = tridiag_forward2( -// prev_vec_sm, am_sm[0], bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, 0, -// c_sm, f_sm)]); -// } -// //__syncthreads(); - -// } else { -// if (c_sm < c_rest && f_sm < f_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = tridiag_forward2( -// prev_vec_sm, am_sm[0], bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, 0, -// c_sm, f_sm)]); -// for (SIZE i = 1; i < r_ghost + r_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = tridiag_forward2( -// vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, f_sm)], am_sm[i], -// bm_sm[i], vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]); -// } -// } -// } -// __syncthreads(); -// /* flush results to v */ -// if (c_sm < c_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < r_ghost + r_rest; i++) { -// vec[get_idx(ldv1, ldv2, r_gl + i, c_sm, f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]; -// } -// } -// __syncthreads(); - -// /* backward */ -// r_rest = nr_c; -// r_ghost = min(nr_c, G); -// r_main = R; -// r_gl = 0; -// prev_vec_sm = 0.0; - -// /* Load first ghost */ -// if (c_sm < c_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < r_ghost; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = -// vec[get_idx(ldv1, ldv2, (nr_c - 1) - (r_gl + i), c_sm, f_sm)]; -// } -// } - -// if (c_sm == 0 && r_sm < r_ghost) { -// am_sm[r_sm] = am[nr_c - (r_gl + r_sm)]; -// bm_sm[r_sm] = bm[nr_c - (r_gl + r_sm)]; -// } -// r_rest -= r_ghost; -// __syncthreads(); - -// while (r_rest > R - r_ghost) { -// r_main = min(R, r_rest); -// if (c_sm < c_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < r_main; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, i + r_ghost, c_sm, f_sm)] = vec[get_idx( -// ldv1, ldv2, (nr_c - 1) - (r_gl + i + r_ghost), c_sm, f_sm)]; -// } -// } -// if (c_sm == 0 && r_sm < r_main) { -// am_sm[r_sm + r_ghost] = am[nr_c - (r_gl + r_sm + r_ghost)]; -// bm_sm[r_sm + r_ghost] = bm[nr_c - (r_gl + r_sm + r_ghost)]; -// } -// __syncthreads(); - -// /* Computation of v in parallel*/ -// if (c_sm < c_rest && f_sm < f_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = -// tridiag_backward2(prev_vec_sm, am_sm[0], bm_sm[0], -// vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)]); -// for (SIZE i = 1; i < R; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = tridiag_backward2( -// vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, f_sm)], -// am_sm[i], bm_sm[i], vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, -// f_sm)]); -// } - -// /* Store last v */ -// prev_vec_sm = vec_sm[get_idx(ldsm1, ldsm2, R - 1, c_sm, f_sm)]; -// } -// __syncthreads(); - -// /* flush results to v */ -// if (c_sm < c_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < R; i++) { -// vec[get_idx(ldv1, ldv2, (nr_c - 1) - (r_gl + i), c_sm, f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]; -// } -// } -// __syncthreads(); - -// // /* Update unloaded col */ -// r_rest -= r_main; - -// /* Advance c */ -// r_gl += R; - -// /* Copy next ghost to main */ -// r_ghost = min(G, r_main - (R - G)); -// if (c_sm < c_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < r_ghost; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, i + R, c_sm, f_sm)]; -// } -// } -// if (c_sm == 0 && r_sm < r_ghost) { -// am_sm[r_sm] = am_sm[r_sm + R]; -// bm_sm[r_sm] = bm_sm[r_sm + R]; -// } -// __syncthreads(); - -// } // end of while - -// /* Load all rest col */ -// if (c_sm < c_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < r_rest; 
i++) { -// vec_sm[get_idx(ldsm1, ldsm2, i + r_ghost, c_sm, f_sm)] = vec[get_idx( -// ldv1, ldv2, (nr_c - 1) - (r_gl + i + r_ghost), c_sm, f_sm)]; -// } -// } -// if (c_sm == 0 && r_sm < r_rest) { -// am_sm[r_sm + r_ghost] = am[nr_c - (r_gl + r_sm + r_ghost)]; -// bm_sm[r_sm + r_ghost] = bm[nr_c - (r_gl + r_sm + r_ghost)]; -// } -// __syncthreads(); - -// /* Only 1 col remain */ -// if (r_ghost + r_rest == 1) { -// if (c_sm < c_rest && f_sm < f_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = -// tridiag_backward2(prev_vec_sm, am_sm[0], bm_sm[0], -// vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)]); -// } -// //__syncthreads(); - -// } else { -// if (c_sm < c_rest && f_sm < f_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = -// tridiag_backward2(prev_vec_sm, am_sm[0], bm_sm[0], -// vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)]); -// for (SIZE i = 1; i < r_ghost + r_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = tridiag_backward2( -// vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, f_sm)], -// am_sm[i], bm_sm[i], vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, -// f_sm)]); -// } -// } -// } -// __syncthreads(); -// /* flush results to v */ -// if (c_sm < c_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < r_ghost + r_rest; i++) { -// vec[get_idx(ldv1, ldv2, (nr_c - 1) - (r_gl + i), c_sm, f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]; -// } -// } -// __syncthreads(); -// } - -// template -// void ipk_3_3d_adaptive_launcher(Handle &handle, SIZE nr_c, SIZE nc_c, -// SIZE nf_c, T *am, T *bm, T *ddist_r, T *dv, -// SIZE lddv1, SIZE lddv2, int queue_idx) { - -// // printf("am: "); -// // print_matrix_cuda(1, nr, am, nr); -// // printf("bm: "); -// // print_matrix_cuda(1, nr, bm, nr); - -// SIZE total_thread_x = nf_c; -// SIZE total_thread_y = nc_c; -// SIZE total_thread_z = 1; -// SIZE tbx, tby, tbz, gridx, gridy, gridz; -// dim3 threadsPerBlock, blockPerGrid; -// size_t sm_size; - -// tbx = F;//std::max(F, std::min(F, total_thread_x)); -// tby = C;//std::max(C, std::min(C, total_thread_y)); -// tbz = 1; -// sm_size = (C * F + 2) * (R + G) * sizeof(T); -// gridx = ceil((float)total_thread_x / tbx); -// gridy = ceil((float)total_thread_y / tby); -// gridz = 1; -// threadsPerBlock = dim3(tbx, tby, tbz); -// blockPerGrid = dim3(gridx, gridy, gridz); -// _ipk_3_3d<<>>( -// nr_c, nc_c, nf_c, am, bm, ddist_r, dv, lddv1, lddv2); -// gpuErrchk(cudaGetLastError()); -// if (handle.sync_and_check_all_kernels) { -// gpuErrchk(cudaDeviceSynchronize()); -// } -// } - -// template -// void ipk_3_3d(Handle &handle, SIZE nr_c, SIZE nc_c, SIZE nf_c, T *am, T -// *bm, -// T *ddist_r, T *dv, SIZE lddv1, SIZE lddv2, int queue_idx, -// int config) { - -// #define IPK(R, C, F, G) \ -// { \ -// ipk_3_3d_adaptive_launcher(handle, nr_c, nc_c, nf_c, am, \ -// bm, ddist_r, dv, lddv1, \ -// lddv2, queue_idx); \ -// } - -// bool profile = false; -// if (handle.profile_kernels) { -// profile = true; -// } -// if (D == 3) { -// if (profile || config == 6) { -// IPK(2, 2, 128, 2) -// } -// if (profile || config == 5) { -// IPK(2, 2, 64, 2) -// } -// if (profile || config == 4) { -// IPK(2, 2, 32, 2) -// } -// if (profile || config == 3) { -// IPK(2, 2, 16, 2) -// } -// if (profile || config == 2) { -// IPK(8, 8, 8, 4) -// } -// if (profile || config == 1) { -// IPK(4, 4, 4, 4) -// } -// if (profile || config == 0) { -// IPK(2, 2, 2, 2) -// } -// } else { -// printf("Error: ipk_3_3d is only for 3D data\n"); -// } -// #undef IPK -// } - } // namespace mgard_x #endif \ No newline at end of file diff 
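All of the removed `_ipk_*_3d` kernels implement the same numerics: a forward-elimination sweep followed by a backward-substitution sweep along one axis, with `prev_vec_sm` carrying the last value across shared-memory windows. Per the `#ifdef MGARD_X_FMA` fallbacks left in the comments, the two update rules reduce to `v[i] -= v[i-1] * bm[i]` and `v[i] = (v[i] - dist[i] * v[i+1]) / am[i]`. A serial sketch of the two sweeps (coefficients `am`, `bm`, `dist` assumed precomputed):

// Serial sketch of the forward/backward tridiagonal sweeps that the
// deleted kernels perform along one axis of the volume.
#include <cstdio>
#include <vector>

void tridiag_solve(std::vector<double> &v, const std::vector<double> &am,
                   const std::vector<double> &bm,
                   const std::vector<double> &dist) {
  const int n = (int)v.size();
  double prev = 0.0;
  for (int i = 0; i < n; i++) { // forward elimination
    v[i] = v[i] - prev * bm[i];
    prev = v[i];
  }
  prev = 0.0;
  for (int i = n - 1; i >= 0; i--) { // backward substitution
    v[i] = (v[i] - dist[i] * prev) / am[i];
    prev = v[i];
  }
}

int main() {
  std::vector<double> v{1, 2, 3, 4}, am(4, 2.0), bm(4, 0.5), dist(4, 0.5);
  tridiag_solve(v, am, bm, dist);
  for (double x : v)
    std::printf("%f ", x);
  std::printf("\n");
  return 0;
}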
--git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel3D_AMR.h b/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel3D_AMR.h
deleted file mode 100644
index 2db0cb8afa..0000000000
--- a/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel3D_AMR.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright 2022, Oak Ridge National Laboratory.
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs
- * Author: Jieyang Chen (chenj3@ornl.gov)
- * Date: March 17, 2022
- */
-
-#ifndef MGARD_X_ITERATIVE_PROCESSING_KERNEL_3D_AMR
-#define MGARD_X_ITERATIVE_PROCESSING_KERNEL_3D_AMR
-
-#include "../../Common.h"
-#include "../../CommonInternal.h"
-
-namespace mgard_x {
-
-template
-void ipk_1_3d_amr(Handle &handle, int nr, int nc, int nf_c, T *am, T *bm,
-                  T *ddist_f, T *dv, int lddv1, int lddv2, bool retrieve,
-                  int block_size, T *fv, int ldfv1, int ldfv2, T *bv, int ldbv1,
-                  int ldbv2, int queue_idx, int config);
-
-// template
-// void ipk_2_3d(Handle &handle, int nr, int nc_c, int nf_c, T *am, T *bm,
-//               T *ddist_c, T *dv, int lddv1, int lddv2, int queue_idx,
-//               int config);
-
-// template
-// void ipk_3_3d(Handle &handle, int nr_c, int nc_c, int nf_c, T *am, T
-// *bm,
-//               T *ddist_r, T *dv, int lddv1, int lddv2, int queue_idx,
-//               int config);
-
-} // namespace mgard_x
-
-#endif
\ No newline at end of file
diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel3D_AMR.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel3D_AMR.hpp
deleted file mode 100644
index 1edf652e08..0000000000
--- a/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel3D_AMR.hpp
+++ /dev/null
@@ -1,1693 +0,0 @@
-/*
- * Copyright 2022, Oak Ridge National Laboratory.
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs
- * Author: Jieyang Chen (chenj3@ornl.gov)
- * Date: March 17, 2022
- */
-
-#ifndef MGARD_X_ITERATIVE_PROCESSING_KERNEL_3D_AMR_TEMPLATE
-#define MGARD_X_ITERATIVE_PROCESSING_KERNEL_3D_AMR_TEMPLATE
-
-#include "../../IPKFunctor.h"
-#include "../../IterativeProcessingKernel3D_AMR.h"
-namespace mgard_x {
-
-// fv has shape nc (lead dim.)
* nr * nf_c / block_size - -template -__global__ void _ipk_1_3d_amr(int nr, int nc, int nf_c, T *am, T *bm, T *dist_f, - T *v, int ldv1, int ldv2, bool retrieve, - int block_size, T *fv, int ldfv1, int ldfv2, - T *bv, int ldbv1, int ldbv2) { - - int c_gl = blockIdx.x * C; - int r_gl = blockIdx.y * R; - int f_gl = threadIdx.x; - - int c_sm = threadIdx.x; - int r_sm = threadIdx.y; - int f_sm = threadIdx.x; - - T *vec = v + get_idx(ldv1, ldv2, r_gl, c_gl, 0); - T *sm = SharedMemory(); - int ldsm1 = F + G; - int ldsm2 = C; - T *vec_sm = sm; - T *bm_sm = sm + R * ldsm1 * ldsm2; - T *dist_sm = bm_sm + ldsm1; - - register T prev_vec_sm = 0.0; - - int c_rest = min(C, nc - blockIdx.x * C); - int r_rest = min(R, nr - blockIdx.y * R); - - int f_rest = nf_c; - int f_ghost = min(nf_c, G); - int f_main = F; - int f_progress = 0; - - // printf("r_sm: %d, r_rest: %d, c_sm: %d, c_rest: %d f_sm: %d, f_rest %d , - // nf_c: %d\n", r_sm, r_rest, c_sm, c_rest, f_sm, f_rest, nf_c); - - // printf("test %f", vec_sm[get_idx(ldsm1, ldsm2, 0, 1, 0)]); - /* Load first ghost */ - if (r_sm < r_rest && f_sm < f_ghost) { - for (int i = 0; i < c_rest; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - vec[get_idx(ldv1, ldv2, r_sm, i, f_gl)]; - // if (r_sm == 0) printf("r0_stride = %d, vec_sm[%d] = %f\n", r0_stride, - // i, vec_sm[i * ldsm + c_sm]); - } - if (r_sm == 0) - bm_sm[f_sm] = bm[f_gl]; - } - - f_rest -= f_ghost; - __syncthreads(); - - while (f_rest > F - f_ghost) { - // if (c_gl == 0 && c_sm == 0 && r_gl == 0 && r_sm == 0) printf("%d %d\n", - // f_rest, F - f_ghost); - f_main = min(F, f_rest); - if (r_sm < r_rest && f_sm < f_main) { - for (int i = 0; i < c_rest; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm + f_ghost)] = - vec[get_idx(ldv1, ldv2, r_sm, i, f_gl + f_ghost)]; - } - if (r_sm == 0) - bm_sm[f_sm + f_ghost] = bm[f_gl + f_ghost]; - } - - __syncthreads(); - - /* Computation of v in parallel*/ - if (r_sm < r_rest && c_sm < c_rest) { - // if (r_gl == 0 && c_gl == 0 && r_sm == 0 && c_sm == 0) printf("%f + %f * - // %f -> %f\n", - // vec_sm[get_idx(ldsm1, - // ldsm2, r_sm, c_sm, 0)], - // prev_vec_sm, bm_sm[0], - // vec_sm[get_idx(ldsm1, - // ldsm2, r_sm, c_sm, - // 0)]+prev_vec_sm * - // bm_sm[0]); - - // store fv - if (f_progress % block_size == 0) { - printf("f_progress0: %d, fv: %f\n", f_progress, prev_vec_sm); - fv[get_idx(ldfv1, ldfv2, r_gl + r_sm, f_progress / block_size, - c_gl + c_sm)] = prev_vec_sm; - } - - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = tridiag_forward( - prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]); - - f_progress++; - - //#pragma unroll 32 - for (int i = 1; i < F; i++) { - // store fv - if (f_progress % block_size == 0) { - printf("f_progress1: %d, fv: %f\n", f_progress, - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)]); - fv[get_idx(ldfv1, ldfv2, r_gl + r_sm, f_progress / block_size, - c_gl + c_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)]; - } - - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] = tridiag_forward( - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)], bm_sm[i], - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)]); - - f_progress++; - // printf("calc[%d]: %f\n", i, vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, - // i)]); if (r_gl == 0 && c_gl == 0) - // printf("out[%d %d %d] %f\n", r_sm, c_sm, i, vec_sm[get_idx(ldsm1, - // ldsm2, r_sm, c_sm, i)]); - } - - /* Store last v */ - prev_vec_sm = vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, F - 1)]; - } - __syncthreads(); - - /* flush results to v */ - if (r_sm < r_rest && f_sm < F) { - for 
(int i = 0; i < c_rest; i++) { - // if (blockIdx.x == 0 && blockIdx.y == 0 && r_sm == 0 && i == 1) { - // printf("store [%d %d %d] %f<-%f [%d %d %d]\n", - // r_sm, i, f_gl, vec[get_idx(ldv1, ldv2, r_sm, i, f_gl)], - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)], r_sm, i, f_sm); - // } - vec[get_idx(ldv1, ldv2, r_sm, i, f_gl)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; - // if (blockIdx.x == 0 && blockIdx.y == 0 && r_sm == 0 && i == 1) { - // printf("store [%d %d %d] %f<-%f [%d %d %d]\n", - // r_sm, i, f_gl, vec[get_idx(ldv1, ldv2, r_sm, i, f_gl)], - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)], r_sm, i, f_sm); - // } - } - } - __syncthreads(); - - /* Update unloaded col */ - f_rest -= f_main; - - /* Advance c */ - f_gl += F; - - // f_progress += F; - - /* Copy next ghost to main */ - f_ghost = min(G, f_main - (F - G)); - if (r_sm < r_rest && f_sm < f_ghost) { - for (int i = 0; i < c_rest; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm + F)]; - } - if (r_sm == 0) - bm_sm[f_sm] = bm_sm[f_sm + blockDim.x]; - } - __syncthreads(); - } // end of while - - /* Load all rest col */ - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < c_rest; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm + f_ghost)] = - vec[get_idx(ldv1, ldv2, r_sm, i, f_gl + f_ghost)]; - } - if (r_sm == 0) - bm_sm[f_sm + f_ghost] = bm[f_gl + f_ghost]; - } - - __syncthreads(); - - /* Only 1 col remain */ - if (f_ghost + f_rest == 1) { - if (r_sm < r_rest && c_sm < c_rest) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = - // __fma_rn(prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, - // r_sm, c_sm, 0)]); - // #else - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] -= prev_vec_sm * - // bm_sm[0]; - // #endif - if ((f_progress) % block_size == 0) { - printf("extra f_progress2: %d, fv(%d %d %d): %f\n", f_progress, - f_progress / block_size, r_gl, c_gl, prev_vec_sm); - fv[get_idx(ldfv1, ldfv2, r_gl + r_sm, f_progress / block_size, - c_gl + c_sm)] = prev_vec_sm; - } - - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = tridiag_forward( - prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]); - - f_progress++; - // printf ("prev_vec_sm = %f\n", prev_vec_sm ); - // printf ("vec_sm[r_sm * ldsm + 0] = %f\n", vec_sm[r_sm * ldsm + 0] ); - } - //__syncthreads(); - - } else { - if (r_sm < r_rest && c_sm < c_rest) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = - // __fma_rn(prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, - // r_sm, c_sm, 0)]); - // #else - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] -= prev_vec_sm * - // bm_sm[0]; - // #endif - if ((f_progress) % block_size == 0) { - printf("extra f_progress3: %d, fv: %f\n", f_progress, prev_vec_sm); - fv[get_idx(ldfv1, ldfv2, r_gl + r_sm, (f_progress) / block_size, - c_gl + c_sm)] = prev_vec_sm; - } - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = tridiag_forward( - prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]); - - f_progress++; - - for (int i = 1; i < f_ghost + f_rest; i++) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] = - // __fma_rn(vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)], - // bm_sm[i], - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)]); - // #else - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] -= - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)] * bm_sm[i]; - // #endif - - if (f_progress % block_size == 0) { - printf("extra f_progress4: %d, fv: %f\n", 
f_progress, - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)]); - fv[get_idx(ldfv1, ldfv2, r_gl + r_sm, f_progress / block_size, - c_gl + c_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)]; - } - - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] = tridiag_forward( - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)], bm_sm[i], - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)]); - - f_progress++; - } - } - } - __syncthreads(); - /* flush results to v */ - if (r_sm < r_rest && f_sm < f_ghost + f_rest) { - for (int i = 0; i < c_rest; i++) { - vec[get_idx(ldv1, ldv2, r_sm, i, f_gl)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; - // printf("c_stride = %d, c_sm = %d, vec_sm = %f, vec[%d] = - // %f\n",c_stride, c_sm, vec_sm[r_sm * ldsm + 0],i * row_stride * lddv + - // c_stride, vec[i * row_stride * lddv + c_stride]); - } - } - __syncthreads(); - - f_progress--; - - /* backward */ - T *am_sm = bm_sm; - f_rest = nf_c; - f_ghost = min(nf_c, G); - f_main = F; - f_gl = threadIdx.x; - prev_vec_sm = 0.0; - - /* Load first ghost */ - if (r_sm < r_rest && f_sm < f_ghost) { - for (int i = 0; i < c_rest; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - vec[get_idx(ldv1, ldv2, r_sm, i, (nf_c - 1) - f_gl)]; - // if (r_sm == 0) printf("r0_stride = %d, vec_sm[%d] = %f\n", r0_stride, - // i, vec_sm[i * ldsm + c_sm]); - } - } - if (r_sm == 0) { - am_sm[f_sm] = am[(nf_c - 1) - f_gl]; - dist_sm[f_sm] = dist_f[(nf_c - 1) - f_gl]; // * -1; - } - f_rest -= f_ghost; - __syncthreads(); - - while (f_rest > F - f_ghost) { - f_main = min(F, f_rest); - if (r_sm < r_rest && f_sm < f_main) { - for (int i = 0; i < c_rest; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm + f_ghost)] = - vec[get_idx(ldv1, ldv2, r_sm, i, (nf_c - 1) - f_gl - f_ghost)]; - } - } - if (r_sm == 0) { - am_sm[f_sm + f_ghost] = am[(nf_c - 1) - f_gl - f_ghost]; - dist_sm[f_sm + f_ghost] = dist_f[(nf_c - 1) - f_gl - f_ghost]; // * -1; - } - __syncthreads(); - - /* Computation of v in parallel*/ - if (r_sm < r_rest && c_sm < c_rest) { - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = - // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] - dist_sm[0] * - // prev_vec_sm) / am_sm[0]; if (r_gl == 0 && c_gl == 0 && r_sm == 0 && - // c_sm == 0) - // printf("(%f + %f * %f) * %f -> %f\n", - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)], - // dist_sm[0], prev_vec_sm, am_sm[0], - // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] - dist_sm[0] - // * prev_vec_sm) / am_sm[0]); - - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = - // __fma_rn(dist_sm[0], prev_vec_sm, - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]) * am_sm[0]; - // #else - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = - // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] - dist_sm[0] * - // prev_vec_sm) / am_sm[0]; - // #endif - - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = - tridiag_backward(prev_vec_sm, dist_sm[0], am_sm[0], - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]); - - if (f_progress > 0 && f_progress % block_size == 0) { - printf("f_progress5: %d, bv: %f\n", f_progress, - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]); - bv[get_idx(ldbv1, ldbv2, r_gl + r_sm, f_progress / block_size - 1, - c_gl + c_sm)] = vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]; - } - - f_progress--; - - //#pragma unroll 32 - for (int i = 1; i < F; i++) { - // if (r_gl == 0 && c_gl == 0 && r_sm == 0 && c_sm == 0) - // printf("(%f + %f * %f) * %f -> %f\n", - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)], - // dist_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i-1)], - 
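The deleted AMR variant adds one idea to the plain kernel: as `f_progress` advances through the forward and backward sweeps, the carried value is checkpointed into `fv` (and into `bv` on the way back) at every `block_size` boundary, so a block can later be re-solved without redoing the whole line. A serial sketch of the forward-sweep checkpointing (self-contained; update rule reduced to `v[i] -= v[i-1] * bm[i]` as noted above):

// Sketch of the per-block checkpointing in the deleted _ipk_1_3d_amr:
// capture the carried value each time f_progress crosses a block boundary.
#include <cstdio>
#include <vector>

int main() {
  const int n = 16, block_size = 4;
  std::vector<double> v(n, 1.0), bm(n, 0.5);
  std::vector<double> fv(n / block_size); // forward-sweep checkpoints
  double prev = 0.0;
  for (int f_progress = 0; f_progress < n; f_progress++) {
    if (f_progress % block_size == 0)
      fv[f_progress / block_size] = prev; // value entering this block
    v[f_progress] -= prev * bm[f_progress];
    prev = v[f_progress];
  }
  for (double x : fv)
    std::printf("%f ", x); // one checkpoint per block
  std::printf("\n");
  return 0;
}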
// (vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] - dist_sm[i] - // * vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i-1)]) * - // am_sm[i]); - - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] = - // __fma_rn(dist_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - // - 1)], - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)]) * am_sm[i]; - // #else - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] = - // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] - - // dist_sm[i] * vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - - // 1)]) / am_sm[i]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] = tridiag_backward( - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)], dist_sm[i], - am_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)]); - - if (f_progress > 0 && f_progress % block_size == 0) { - printf("f_progress6: %d, bv: %f\n", f_progress, - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)]); - bv[get_idx(ldbv1, ldbv2, r_gl + r_sm, f_progress / block_size - 1, - c_gl + c_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)]; - } - - f_progress--; - } - /* Store last v */ - prev_vec_sm = vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, blockDim.x - 1)]; - } - __syncthreads(); - - /* flush results to v */ - if (r_sm < r_rest && f_sm < F) { - for (int i = 0; i < c_rest; i++) { - vec[get_idx(ldv1, ldv2, r_sm, i, (nf_c - 1) - f_gl)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; - } - } - __syncthreads(); - - /* Update unloaded col */ - f_rest -= f_main; - - /* Advance c */ - f_gl += F; - - /* Copy next ghost to main */ - f_ghost = min(G, f_main - (F - G)); - if (r_sm < r_rest && f_sm < f_ghost) { - for (int i = 0; i < c_rest; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm + F)]; - } - if (r_sm == 0) { - am_sm[f_sm] = am_sm[f_sm + F]; - dist_sm[f_sm] = dist_sm[f_sm + F]; - } - } - __syncthreads(); - } // end of while - - /* Load all rest col */ - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < c_rest; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm + f_ghost)] = - vec[get_idx(ldv1, ldv2, r_sm, i, (nf_c - 1) - f_gl - f_ghost)]; - } - } - if (r_sm == 0) { - am_sm[f_sm + f_ghost] = am[(nf_c - 1) - f_gl - f_ghost]; - dist_sm[f_sm + f_ghost] = dist_f[(nf_c - 1) - f_gl - f_ghost]; - } - __syncthreads(); - - /* Only 1 col remain */ - if (f_ghost + f_rest == 1) { - if (r_sm < r_rest && c_sm < c_rest) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = - // __fma_rn(dist_sm[0], prev_vec_sm, - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]) * am_sm[0]; - // #else - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = - // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] - dist_sm[0] * - // prev_vec_sm) / am_sm[0]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = - tridiag_backward(prev_vec_sm, dist_sm[0], am_sm[0], - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]); - - if (f_progress > 0 && f_progress % block_size == 0) { - printf("f_progress7: %d, bv: %f\n", f_progress, - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]); - bv[get_idx(ldbv1, ldbv2, r_gl + r_sm, f_progress / block_size - 1, - c_gl + c_sm)] = vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]; - } - - f_progress--; - // printf ("prev_vec_sm = %f\n", prev_vec_sm ); - // printf ("vec_sm[r_sm * ldsm + 0] = %f\n", vec_sm[r_sm * ldsm + 0] ); - } - //__syncthreads(); - - } else { - if (r_sm < r_rest && c_sm < c_rest) { - - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = - // __fma_rn(dist_sm[0], prev_vec_sm, - // 
vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]) * am_sm[0]; - // #else - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = - // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] - dist_sm[0] * - // prev_vec_sm) / am_sm[0]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = - tridiag_backward(prev_vec_sm, dist_sm[0], am_sm[0], - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]); - - if (f_progress > 0 && f_progress % block_size == 0) { - printf("f_progress8: %d, bv: %f\n", f_progress, - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]); - bv[get_idx(ldbv1, ldbv2, r_gl + r_sm, f_progress / block_size - 1, - c_gl + c_sm)] = vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]; - } - - f_progress--; - - for (int i = 1; i < f_ghost + f_rest; i++) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] = - // __fma_rn(dist_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - // - 1)], - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)]) * am_sm[i]; - // #else - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] = - // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] - - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)] * - // dist_sm[i]) / am_sm[i]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] = tridiag_backward( - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)], dist_sm[i], - am_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)]); - - if (f_progress > 0 && f_progress % block_size == 0) { - printf("f_progress9: %d, bv: %f\n", f_progress, - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)]); - bv[get_idx(ldbv1, ldbv2, r_gl + r_sm, f_progress / block_size - 1, - c_gl + c_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)]; - } - - f_progress--; - } - } - } - __syncthreads(); - /* flush results to v */ - if (r_sm < r_rest && f_sm < f_ghost + f_rest) { - for (int i = 0; i < c_rest; i++) { - vec[get_idx(ldv1, ldv2, r_sm, i, (nf_c - 1) - f_gl)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; - // printf("c_stride = %d, c_sm = %d, vec_sm = %f, vec[%d] = - // %f\n",c_stride, c_sm, vec_sm[r_sm * ldsm + 0],i * row_stride * lddv + - // c_stride, vec[i * row_stride * lddv + c_stride]); - } - } - __syncthreads(); -} - -template -void ipk_1_3d_amr_adaptive_launcher(Handle &handle, int nr, int nc, - int nf_c, T *am, T *bm, T *ddist_f, T *dv, - int lddv1, int lddv2, bool retrieve, - int block_size, T *fv, int ldfv1, int ldfv2, - T *bv, int ldbv1, int ldbv2, - int queue_idx) { - // std::cout << "test\n"; - - int total_thread_x = nc; - int total_thread_y = nr; - int total_thread_z = 1; - int tbx, tby, tbz, gridx, gridy, gridz; - dim3 threadsPerBlock, blockPerGrid; - size_t sm_size; - - tbx = C; // std::max(C, std::min(C, total_thread_x)); - tby = R; // std::max(R, std::min(R, total_thread_y)); - tbz = 1; - sm_size = (R * C + 2) * (F + G) * sizeof(T); - gridx = ceil((float)total_thread_x / tbx); - gridy = ceil((float)total_thread_y / tby); - gridz = 1; - threadsPerBlock = dim3(F, tby, tbz); - blockPerGrid = dim3(gridx, gridy, gridz); - - _ipk_1_3d_amr<<>>( - nr, nc, nf_c, am, bm, ddist_f, dv, lddv1, lddv2, retrieve, block_size, fv, - ldfv1, ldfv2, bv, ldbv1, ldbv2); - gpuErrchk(cudaGetLastError()); - if (handle.sync_and_check_all_kernels) { - gpuErrchk(cudaDeviceSynchronize()); - } - // std::cout << "test\n"; -} - -template -void ipk_1_3d_amr(Handle &handle, int nr, int nc, int nf_c, T *am, T *bm, - T *ddist_f, T *dv, int lddv1, int lddv2, bool retrieve, - int block_size, T *fv, int ldfv1, int ldfv2, T *bv, int ldbv1, - int ldbv2, int queue_idx, int config) { - -#define IPK(R, C, F, G) \ - { \ - 
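The deleted `ipk_1_3d_amr_adaptive_launcher` sizes its launch the same way the plain launchers do: shared memory holds one `(F + G)`-deep line per `(r, c)` thread of the tile plus two coefficient arrays, i.e. `(R * C + 2) * (F + G) * sizeof(T)` bytes, and the grid covers the `nc x nr` threads in tiles of `C x R`. A standalone sketch of that arithmetic (example numbers only):

// Sketch of the launch-geometry math in the deleted adaptive launchers.
#include <cstdio>

int main() {
  const int R = 2, C = 2, F = 128, G = 2;
  const int nr = 100, nc = 100; // total threads in y and x
  size_t sm_size = (R * C + 2) * (F + G) * sizeof(double);
  int gridx = (nc + C - 1) / C; // ceil(nc / C)
  int gridy = (nr + R - 1) / R; // ceil(nr / R)
  std::printf("grid %d x %d, %zu bytes of shared memory per block\n",
              gridx, gridy, sm_size);
  return 0;
}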
ipk_1_3d_amr_adaptive_launcher( \ - handle, nr, nc, nf_c, am, bm, ddist_f, dv, lddv1, lddv2, retrieve, \ - block_size, fv, ldfv1, ldfv2, bv, ldbv1, ldbv2, queue_idx); \ - } - bool profile = false; - if (handle.profile_kernels) { - profile = true; - } - if (D == 3) { - if (profile || config == 6) { - IPK(2, 2, 128, 2) - } - if (profile || config == 5) { - IPK(2, 2, 64, 2) - } - if (profile || config == 4) { - IPK(2, 2, 32, 2) - } - if (profile || config == 3) { - IPK(4, 4, 16, 4) - } - if (profile || config == 2) { - IPK(8, 8, 8, 4) - } - if (profile || config == 1) { - IPK(4, 4, 4, 4) - } - if (profile || config == 0) { - IPK(2, 2, 2, 2) - } - } else if (D == 2) { - if (profile || config == 6) { - IPK(1, 2, 128, 2) - } - if (profile || config == 5) { - IPK(1, 2, 64, 2) - } - if (profile || config == 4) { - IPK(1, 2, 32, 2) - } - if (profile || config == 3) { - IPK(1, 4, 16, 4) - } - if (profile || config == 2) { - IPK(1, 8, 8, 4) - } - if (profile || config == 1) { - IPK(1, 4, 4, 4) - } - if (profile || config == 0) { - IPK(1, 2, 4, 2) - } - } else if (D == 1) { - if (profile || config == 6) { - IPK(1, 1, 128, 2) - } - if (profile || config == 5) { - IPK(1, 1, 64, 2) - } - if (profile || config == 4) { - IPK(1, 1, 32, 2) - } - if (profile || config == 3) { - IPK(1, 1, 16, 4) - } - if (profile || config == 2) { - IPK(1, 1, 8, 4) - } - if (profile || config == 1) { - IPK(1, 1, 8, 4) - } - if (profile || config == 0) { - IPK(1, 1, 8, 2) - } - } -#undef IPK -} - -#if 0 -template -__global__ void _ipk_2_3d(int nr, int nc_c, int nf_c, T *am, T *bm, T *dist_c, - T *v, int ldv1, int ldv2) { - - int f_gl = blockIdx.x * F; - int r_gl = blockIdx.y * R; - int c_gl = 0; - - int f_sm = threadIdx.x; - int r_sm = threadIdx.y; - int c_sm = threadIdx.x; - - T *vec = v + get_idx(ldv1, ldv2, r_gl, 0, f_gl); - T *sm = SharedMemory(); - int ldsm1 = F; - int ldsm2 = C + G; - T *vec_sm = sm; - T *bm_sm = sm + R * ldsm1 * ldsm2; - T *dist_sm = bm_sm + ldsm2; - - register T prev_vec_sm = 0.0; - - int f_rest = min(F, nf_c - blockIdx.x * F); - int r_rest = min(R, nr - blockIdx.y * R); - - // if (blockIdx.x == 1 && blockIdx.y == 0 && f_sm == 0 && r_sm == 0) { - // printf("f_rest: %d r_rest: %d\n", f_rest, r_rest); - // } - - int c_rest = nc_c; - int c_ghost = min(nc_c, G); - int c_main = C; - - /* Load first ghost */ - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < c_ghost; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - vec[get_idx(ldv1, ldv2, r_sm, c_gl + i, f_sm)]; - // if (r_sm == 0) printf("r0_stride = %d, vec_sm[%d] = %f\n", r0_stride, - // i, vec_sm[i * ldsm + c_sm]); - } - } - if (r_sm == 0 && c_sm < c_ghost) - bm_sm[c_sm] = bm[c_gl + c_sm]; - c_rest -= c_ghost; - __syncthreads(); - - while (c_rest > C - c_ghost) { - // printf("%d %d %d\n", c_rest, C, c_ghost); - c_main = min(C, c_rest); - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < c_main; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + c_ghost, f_sm)] = - vec[get_idx(ldv1, ldv2, r_sm, c_gl + i + c_ghost, f_sm)]; - } - } - if (r_sm == 0 && c_sm < c_main) - bm_sm[c_sm + c_ghost] = bm[c_gl + c_sm + c_ghost]; - __syncthreads(); - - /* Computation of v in parallel*/ - if (r_sm < r_rest && f_sm < f_rest) { - - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = - // __fma_rn(prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, - // r_sm, 0, f_sm)]); - // #else - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] -= prev_vec_sm * - // bm_sm[0]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = 
tridiag_forward( - prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)]); - - for (int i = 1; i < C; i++) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - // __fma_rn(vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)], - // bm_sm[i], - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]); - // #else - // // if (blockIdx.x == 1 && blockIdx.y == 0 && f_sm == 0 && r_sm - // == 0) { - // // printf("calc: %f %f %f -> %f \n", vec_sm[get_idx(ldsm1, - // ldsm2, r_sm, i, f_sm)], - // // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)], - // bm_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] - - // // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)] * - // bm_sm[i]); - // // } - - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] -= - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)] * bm_sm[i]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = tridiag_forward( - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)], bm_sm[i], - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]); - } - /* Store last v */ - prev_vec_sm = vec_sm[get_idx(ldsm1, ldsm2, r_sm, C - 1, f_sm)]; - } - __syncthreads(); - - /* flush results to v */ - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < C; i++) { - // if (blockIdx.x == 1 && blockIdx.y == 0 && f_sm == 0 && r_sm == 0) { - // printf("store: %f\n", vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, - // f_sm)]); - // } - vec[get_idx(ldv1, ldv2, r_sm, c_gl + i, f_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; - } - } - __syncthreads(); - - /* Update unloaded col */ - c_rest -= c_main; - - /* Advance c */ - c_gl += C; - - /* Copy next ghost to main */ - c_ghost = min(G, c_main - (C - G)); - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < c_ghost; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + C, f_sm)]; - } - } - if (r_sm == 0 && c_sm < c_ghost) - bm_sm[c_sm] = bm_sm[c_sm + C]; - __syncthreads(); - - } // end of while - - /* Load all rest col */ - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < c_rest; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + c_ghost, f_sm)] = - vec[get_idx(ldv1, ldv2, r_sm, c_gl + i + c_ghost, f_sm)]; - } - } - if (r_sm == 0 && c_sm < c_rest) - bm_sm[c_sm + c_ghost] = bm[c_gl + c_sm + c_ghost]; - __syncthreads(); - - /* Only 1 col remain */ - if (c_ghost + c_rest == 1) { - if (r_sm < r_rest && f_sm < f_rest) { - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] -= prev_vec_sm * bm_sm[0]; - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = - // __fma_rn(prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, - // r_sm, 0, f_sm)]); - // #else - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] -= prev_vec_sm * - // bm_sm[0]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = tridiag_forward( - prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)]); - // printf ("prev_vec_sm = %f\n", prev_vec_sm ); - // printf ("vec_sm[r_sm * ldsm + 0] = %f\n", vec_sm[r_sm * ldsm + 0] ); - } - //__syncthreads(); - - } else { - if (r_sm < r_rest && f_sm < f_rest) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = - // __fma_rn(prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, - // r_sm, 0, f_sm)]); - // #else - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] -= prev_vec_sm * - // bm_sm[0]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = tridiag_forward( - prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)]); - for (int i = 1; i < c_ghost + c_rest; 
i++) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - // __fma_rn(vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)], - // bm_sm[i], - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]); - // #else - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] -= - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)] * bm_sm[i]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = tridiag_forward( - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)], bm_sm[i], - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]); - } - } - } - __syncthreads(); - /* flush results to v */ - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < c_ghost + c_rest; i++) { - vec[get_idx(ldv1, ldv2, r_sm, c_gl + i, f_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; - // printf("c_stride = %d, c_sm = %d, vec_sm = %f, vec[%d] = - // %f\n",c_stride, c_sm, vec_sm[r_sm * ldsm + 0],i * row_stride * lddv + - // c_stride, vec[i * row_stride * lddv + c_stride]); - } - } - __syncthreads(); - - /* backward */ - T *am_sm = bm_sm; - c_rest = nc_c; - c_ghost = min(nc_c, G); - c_main = C; - c_gl = 0; - prev_vec_sm = 0.0; - - // if (f_gl+f_sm == 0 && r_gl+r_sm == 0 && idx[3] == 0) debug = false; - // if (debug) printf("block id: (%d %d %d) thread id: (%d %d %d)\n", - // blockIdx.x, blockIdx.y, blockIdx.z, - // threadIdx.x, threadIdx.y, threadIdx.z); - - /* Load first ghost */ - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < c_ghost; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - vec[get_idx(ldv1, ldv2, r_sm, (nc_c - 1) - (c_gl + i), f_sm)]; - // if (debug) printf("load vec_sm[%d] = %f\n", get_idx(ldsm1, ldsm2, r_sm, - // i, f_sm), vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]); - } - } - if (r_sm == 0 && c_sm < c_ghost) { - am_sm[c_sm] = am[(nc_c - 1) - (c_gl + c_sm)]; - dist_sm[c_sm] = dist_c[(nc_c - 1) - (c_gl + c_sm)]; - } - c_rest -= c_ghost; - __syncthreads(); - - while (c_rest > C - c_ghost) { - // printf("%d %d %d\n", c_rest, C, c_ghost); - c_main = min(C, c_rest); - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < c_main; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + c_ghost, f_sm)] = vec[get_idx( - ldv1, ldv2, r_sm, (nc_c - 1) - (c_gl + i + c_ghost), f_sm)]; - // if (debug) printf("load vec_sm[%d] = %f\n", get_idx(ldsm1, ldsm2, - // r_sm, i + c_ghost, f_sm), vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + - // c_ghost, f_sm)]); - } - } - if (r_sm == 0 && c_sm < c_main) { - am_sm[c_sm + c_ghost] = am[(nc_c - 1) - (c_gl + c_sm + c_ghost)]; - dist_sm[c_sm + c_ghost] = dist_c[(nc_c - 1) - (c_gl + c_sm + c_ghost)]; - } - __syncthreads(); - - // if (r_gl == 0 && f_gl == 0 && r_sm == 0 && f_sm == 0) - // printf("*****test\n"); - /* Computation of v in parallel*/ - if (r_sm < r_rest && f_sm < f_rest) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = - // __fma_rn(dist_sm[0], prev_vec_sm, vec_sm[get_idx(ldsm1, ldsm2, - // r_sm, 0, f_sm)]) * am_sm[0]; - // #else - // // if (r_gl == 0 && f_gl == 0 && r_sm == 0 && f_sm == 0) - // // printf("(%f + %f * %f) * %f -> %f\n", - // // vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)], - // // dist_sm[0], prev_vec_sm, am_sm[0], - // // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] - - // dist_sm[0] * prev_vec_sm) / am_sm[0]); vec_sm[get_idx(ldsm1, - // ldsm2, r_sm, 0, f_sm)] = (vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, - // f_sm)] - dist_sm[0] * prev_vec_sm) / am_sm[0]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, c_sm)] = - tridiag_backward(prev_vec_sm, dist_sm[0], am_sm[0], - vec_sm[get_idx(ldsm1, ldsm2, r_sm, 
0, c_sm)]); - // if (debug) printf("calc vec_sm[%d] = %f\n", get_idx(ldsm1, ldsm2, r_sm, - // 0, f_sm), vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)]); - - for (int i = 1; i < C; i++) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - // __fma_rn(dist_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, - // f_sm)], - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)]) * am_sm[i]; - // #else - // // if (r_gl == 0 && f_gl == 0 && r_sm == 0 && f_sm == 0) - // // printf("(%f + %f * %f) * %f -> %f\n", - // // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)], - // // dist_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, - // i-1, f_sm)], am_sm[i], - // // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] - - // // dist_sm[i] * vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, - // f_sm)]) / am_sm[i]); - - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] - - // dist_sm[i] * vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, - // f_sm)]) / am_sm[i]; - - // #endif - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = tridiag_backward( - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)], dist_sm[i], - am_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]); - - // if (debug) printf("calc vec_sm[%d] = %f\n", get_idx(ldsm1, ldsm2, - // r_sm, i, f_sm), vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]); - } - - /* Store last v */ - prev_vec_sm = vec_sm[get_idx(ldsm1, ldsm2, r_sm, C - 1, f_sm)]; - } - __syncthreads(); - - /* flush results to v */ - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < C; i++) { - vec[get_idx(ldv1, ldv2, r_sm, (nc_c - 1) - (c_gl + i), f_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; - } - } - __syncthreads(); - - /* Update unloaded col */ - c_rest -= c_main; - - /* Advance c */ - c_gl += C; - - /* Copy next ghost to main */ - c_ghost = min(G, c_main - (C - G)); - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < c_ghost; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + C, f_sm)]; - } - } - if (r_sm == 0 && c_sm < c_ghost) { - am_sm[c_sm] = am_sm[c_sm + C]; - dist_sm[c_sm] = dist_sm[c_sm + C]; - } - __syncthreads(); - - } // end of while - - // Load all rest col - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < c_rest; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + c_ghost, f_sm)] = vec[get_idx( - ldv1, ldv2, r_sm, (nc_c - 1) - (c_gl + i + c_ghost), f_sm)]; - - // if (debug) printf("load ec_sm[%d] = %f\n", get_idx(ldsm1, ldsm2, r_sm, - // i + c_ghost, f_sm), vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + c_ghost, - // f_sm)]); - } - } - if (r_sm == 0 && c_sm < c_rest) { - am_sm[c_sm + c_ghost] = am[(nc_c - 1) - (c_gl + c_sm + c_ghost)]; - dist_sm[c_sm + c_ghost] = dist_c[(nc_c - 1) - (c_gl + c_sm + c_ghost)]; - } - __syncthreads(); - - /* Only 1 col remain */ - if (c_ghost + c_rest == 1) { - if (r_sm < r_rest && f_sm < f_rest) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = - // __fma_rn(dist_sm[0], prev_vec_sm, vec_sm[get_idx(ldsm1, ldsm2, - // r_sm, 0, f_sm)]) * am_sm[0]; - // #else - // // if (r_gl == 0 && f_gl == 0 && r_sm == 0 && f_sm == 0) - // // printf("(%f + %f * %f) * %f -> %f\n", - // // vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)], - // // dist_sm[0], prev_vec_sm, am_sm[0], - // // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] - - // dist_sm[0] * prev_vec_sm) / am_sm[0]); vec_sm[get_idx(ldsm1, - // ldsm2, r_sm, 0, f_sm)] = (vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, - // f_sm)] - dist_sm[0] * prev_vec_sm) / am_sm[0]; - // #endif 
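// ---------------------------------------------------------------------------
// [Editor's aside] tridiag_forward/tridiag_backward used throughout these
// kernels are the two halves of a Thomas solve with precomputed factors
// am/bm: forward elimination down the line, then back substitution in
// reverse. A minimal serial reference, assuming the helpers match the
// commented-out scalar code above (thomas_reference is an illustrative name,
// not MGARD API):
template <typename T>
void thomas_reference(int n, const T *am, const T *bm, const T *dist, T *v) {
  // Forward elimination: v[0] is untouched (prev_vec_sm starts at 0).
  for (int i = 1; i < n; i++)
    v[i] -= v[i - 1] * bm[i];
  // Back substitution, walking in reverse with v[n] treated as zero; this is
  // why the kernels reload vec/am/dist with reversed ((n-1) - i) indices.
  for (int j = n - 1; j >= 0; j--) {
    T next = (j + 1 < n) ? v[j + 1] : (T)0;
    v[j] = (v[j] - dist[j] * next) / am[j];
  }
}
// The blocked kernels stream fixed-size tiles of each line through shared
// memory, carrying the last value of a tile (prev_vec_sm) into the next, so
// each thread applies exactly these recurrences along its row.
// ---------------------------------------------------------------------------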
- vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, c_sm)] = - tridiag_backward(prev_vec_sm, dist_sm[0], am_sm[0], - vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, c_sm)]); - // if (debug) printf("calc vec_sm[%d] = %f\n", get_idx(ldsm1, ldsm2, r_sm, - // 0, f_sm), vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)]); - // printf ("prev_vec_sm = %f\n", prev_vec_sm ); - // printf ("vec_sm[r_sm * ldsm + 0] = %f\n", vec_sm[r_sm * ldsm + 0] ); - } - //__syncthreads(); - - } else { - if (r_sm < r_rest && f_sm < f_rest) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = - // __fma_rn(dist_sm[0], prev_vec_sm, vec_sm[get_idx(ldsm1, ldsm2, - // r_sm, 0, f_sm)]) * am_sm[0]; - // #else - // // if (r_gl == 0 && f_gl == 0 && r_sm == 0 && f_sm == 0) - // // printf("(%f + %f * %f) * %f -> %f\n", - // // vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)], - // // dist_sm[0], prev_vec_sm, am_sm[0], - // // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] - - // dist_sm[0] * prev_vec_sm) / am_sm[0]); vec_sm[get_idx(ldsm1, - // ldsm2, r_sm, 0, f_sm)] = (vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, - // f_sm)] - dist_sm[0] * prev_vec_sm) / am_sm[0]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, c_sm)] = - tridiag_backward(prev_vec_sm, dist_sm[0], am_sm[0], - vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, c_sm)]); - // if (debug) printf("calc vec_sm[%d] = %f\n", get_idx(ldsm1, ldsm2, r_sm, - // 0, f_sm), vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)]); - for (int i = 1; i < c_ghost + c_rest; i++) { - - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - // __fma_rn(dist_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, - // f_sm)], - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)]) * am_sm[i]; - // #else - // // if (r_gl == 0 && f_gl == 0 && r_sm == 0 && f_sm == 0) - // // printf("(%f + %f * %f) * %f -> %f\n", - // // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)], - // // dist_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, - // i-1, f_sm)], am_sm[i], - // // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] - - // // dist_sm[i] * vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, - // f_sm)]) / am_sm[i]); - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] - - // dist_sm[i] * vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, - // f_sm)]) / am_sm[i]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = tridiag_backward( - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)], dist_sm[i], - am_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]); - // if (debug) printf("calc vec_sm[%d] = %f\n", get_idx(ldsm1, ldsm2, - // r_sm, i, f_sm), vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]); - } - } - } - __syncthreads(); - /* flush results to v */ - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < c_ghost + c_rest; i++) { - vec[get_idx(ldv1, ldv2, r_sm, (nc_c - 1) - (c_gl + i), f_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; - // printf("c_stride = %d, c_sm = %d, vec_sm = %f, vec[%d] = - // %f\n",c_stride, c_sm, vec_sm[r_sm * ldsm + 0],i * row_stride * lddv + - // c_stride, vec[i * row_stride * lddv + c_stride]); - } - } - __syncthreads(); -} - -template -void ipk_2_3d_adaptive_launcher(Handle &handle, int nr, int nc_c, - int nf_c, T *am, T *bm, T *ddist_c, T *dv, - int lddv1, int lddv2, int queue_idx) { - int total_thread_x = nf_c; - int total_thread_y = nr; - int total_thread_z = 1; - int tbx, tby, tbz, gridx, gridy, gridz; - dim3 threadsPerBlock, blockPerGrid; - size_t sm_size; - - tbx = F;//std::max(F, std::min(F, total_thread_x)); - tby = R;//std::max(R, std::min(R, 
total_thread_y));
-  tbz = 1;
-  sm_size = (R * F + 2) * (C + G) * sizeof(T);
-  gridx = ceil((float)total_thread_x / tbx);
-  gridy = ceil((float)total_thread_y / tby);
-  gridz = 1;
-  threadsPerBlock = dim3(tbx, tby, tbz);
-  blockPerGrid = dim3(gridx, gridy, gridz);
-  _ipk_2_3d<T, R, C, F, G><<<blockPerGrid, threadsPerBlock, sm_size,
-                             *(cudaStream_t *)handle.get(queue_idx)>>>(
-      nr, nc_c, nf_c, am, bm, ddist_c, dv, lddv1, lddv2);
-  gpuErrchk(cudaGetLastError());
-  if (handle.sync_and_check_all_kernels) {
-    gpuErrchk(cudaDeviceSynchronize());
-  }
-}
-
-template <DIM D, typename T>
-void ipk_2_3d(Handle<D, T> &handle, int nr, int nc_c, int nf_c, T *am, T *bm,
-              T *ddist_c, T *dv, int lddv1, int lddv2, int queue_idx,
-              int config) {
-
-#define IPK(R, C, F, G)                                                        \
-  {                                                                            \
-    ipk_2_3d_adaptive_launcher<D, T, R, C, F, G>(                              \
-        handle, nr, nc_c, nf_c, am, bm, ddist_c, dv, lddv1, lddv2, queue_idx); \
-  }
-  bool profile = false;
-  if (handle.profile_kernels) {
-    profile = true;
-  }
-  if (D == 3) {
-    if (profile || config == 6) {
-      IPK(2, 2, 128, 2)
-    }
-    if (profile || config == 5) {
-      IPK(2, 2, 64, 2)
-    }
-    if (profile || config == 4) {
-      IPK(2, 2, 32, 2)
-    }
-    if (profile || config == 3) {
-      IPK(4, 4, 16, 4)
-    }
-    if (profile || config == 2) {
-      IPK(8, 8, 8, 4)
-    }
-    if (profile || config == 1) {
-      IPK(4, 4, 4, 4)
-    }
-    if (profile || config == 0) {
-      IPK(2, 2, 2, 2)
-    }
-  } else if (D == 2) {
-    if (profile || config == 6) {
-      IPK(1, 2, 128, 2)
-    }
-    if (profile || config == 5) {
-      IPK(1, 2, 64, 2)
-    }
-    if (profile || config == 4) {
-      IPK(1, 2, 32, 2)
-    }
-    if (profile || config == 3) {
-      IPK(1, 4, 16, 4)
-    }
-    if (profile || config == 2) {
-      IPK(1, 8, 8, 4)
-    }
-    if (profile || config == 1) {
-      IPK(1, 4, 4, 4)
-    }
-    if (profile || config == 0) {
-      IPK(1, 2, 4, 2)
-    }
-  } else {
-    printf("Error: ipk_2_3d is only for 3D and 2D data\n");
-  }
-#undef IPK
-}
-
-template <typename T, int R, int C, int F, int G>
-__global__ void _ipk_3_3d(int nr_c, int nc_c, int nf_c, T *am, T *bm,
-                          T *dist_r, T *v, int ldv1, int ldv2) {
-
-  int f_gl = blockIdx.x * F;
-  int c_gl = blockIdx.y * C;
-  int r_gl = 0;
-
-  int f_sm = threadIdx.x;
-  int c_sm = threadIdx.y;
-  int r_sm = threadIdx.x;
-
-  T *vec = v + get_idx(ldv1, ldv2, 0, c_gl, f_gl);
-  T *sm = SharedMemory<T>();
-  int ldsm1 = F;
-  int ldsm2 = C;
-  T *vec_sm = sm;
-  T *bm_sm = sm + (R + G) * ldsm1 * ldsm2;
-  T *dist_sm = bm_sm + (R + G);
-
-  register T prev_vec_sm = 0.0;
-
-  int f_rest = min(F, nf_c - blockIdx.x * F);
-  int c_rest = min(C, nc_c - blockIdx.y * C);
-
-  int r_rest = nr_c;
-  int r_ghost = min(nr_c, G);
-  int r_main = R;
-
-  /* Load first ghost */
-  if (c_sm < c_rest && f_sm < f_rest) {
-    for (int i = 0; i < r_ghost; i++) {
-      vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] =
-          vec[get_idx(ldv1, ldv2, r_gl + i, c_sm, f_sm)];
-      // if (r_sm == 0) printf("r0_stride = %d, vec_sm[%d] = %f\n", r0_stride,
-      // i, vec_sm[i * ldsm + c_sm]);
-    }
-  }
-
-  if (c_sm == 0 && r_sm < r_ghost)
-    bm_sm[r_sm] = bm[r_gl + r_sm];
-  r_rest -= r_ghost;
-  __syncthreads();
-
-  while (r_rest > R - r_ghost) {
-    r_main = min(R, r_rest);
-    if (c_sm < c_rest && f_sm < f_rest) {
-      for (int i = 0; i < r_main; i++) {
-        vec_sm[get_idx(ldsm1, ldsm2, i + r_ghost, c_sm, f_sm)] =
-            vec[get_idx(ldv1, ldv2, r_gl + i + r_ghost, c_sm, f_sm)];
-        // printf("%d\n", r_gl + i + r_ghost);
-      }
-    }
-    if (c_sm == 0 && r_sm < r_main)
-      bm_sm[r_sm + r_ghost] = bm[r_gl + r_sm + r_ghost];
-    __syncthreads();
-
-    /* Computation of v in parallel*/
-    if (c_sm < c_rest && f_sm < f_rest) {
-
-      // #ifdef MGARD_X_FMA
-      //       vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] =
-      //       __fma_rn(prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, 0,
-      //       c_sm, f_sm)]);
-      // #else
-      //       vec_sm[get_idx(ldsm1, ldsm2, 0,
c_sm, f_sm)] -= prev_vec_sm * - // bm_sm[0]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = tridiag_forward( - prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)]); - for (int i = 1; i < R; i++) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = - // __fma_rn(vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, f_sm)], - // bm_sm[i], - // vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]); - // #else - // vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] -= - // vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, f_sm)] * bm_sm[i]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = tridiag_forward( - vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, f_sm)], bm_sm[i], - vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]); - } - - /* Store last v */ - prev_vec_sm = vec_sm[get_idx(ldsm1, ldsm2, R - 1, c_sm, f_sm)]; - } - __syncthreads(); - - /* flush results to v */ - if (c_sm < c_rest && f_sm < f_rest) { - for (int i = 0; i < R; i++) { - vec[get_idx(ldv1, ldv2, r_gl + i, c_sm, f_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]; - } - } - __syncthreads(); - - // /* Update unloaded col */ - r_rest -= r_main; - - /* Advance c */ - r_gl += R; - - /* Copy next ghost to main */ - r_ghost = min(G, r_main - (R - G)); - if (c_sm < c_rest && f_sm < f_rest) { - for (int i = 0; i < r_ghost; i++) { - vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, i + R, c_sm, f_sm)]; - } - } - if (c_sm == 0 && r_sm < r_ghost) - bm_sm[r_sm] = bm_sm[r_sm + R]; - __syncthreads(); - - } // end of while - - /* Load all rest col */ - if (c_sm < c_rest && f_sm < f_rest) { - for (int i = 0; i < r_rest; i++) { - vec_sm[get_idx(ldsm1, ldsm2, i + r_ghost, c_sm, f_sm)] = - vec[get_idx(ldv1, ldv2, r_gl + i + r_ghost, c_sm, f_sm)]; - } - } - - if (c_sm == 0 && r_sm < r_rest) - bm_sm[r_sm + r_ghost] = bm[r_gl + r_sm + r_ghost]; - __syncthreads(); - - /* Only 1 col remain */ - if (r_ghost + r_rest == 1) { - if (c_sm < c_rest && f_sm < f_rest) { - - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = - // __fma_rn(prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, 0, - // c_sm, f_sm)]); - // #else - // vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] -= prev_vec_sm * - // bm_sm[0]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = tridiag_forward( - prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)]); - // printf ("prev_vec_sm = %f\n", prev_vec_sm ); - // printf ("vec_sm[r_sm * ldsm + 0] = %f\n", vec_sm[r_sm * ldsm + 0] ); - } - //__syncthreads(); - - } else { - if (c_sm < c_rest && f_sm < f_rest) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = - // __fma_rn(prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, 0, - // c_sm, f_sm)]); - // #else - // vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] -= prev_vec_sm * - // bm_sm[0]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = tridiag_forward( - prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)]); - for (int i = 1; i < r_ghost + r_rest; i++) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = - // __fma_rn(vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, f_sm)], - // bm_sm[i], - // vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]); - // #else - // vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] -= - // vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, f_sm)] * bm_sm[i]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = tridiag_forward( - vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, f_sm)], bm_sm[i], - 
vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]); - } - } - } - __syncthreads(); - /* flush results to v */ - if (c_sm < c_rest && f_sm < f_rest) { - for (int i = 0; i < r_ghost + r_rest; i++) { - vec[get_idx(ldv1, ldv2, r_gl + i, c_sm, f_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]; - // printf("c_stride = %d, c_sm = %d, vec_sm = %f, vec[%d] = - // %f\n",c_stride, c_sm, vec_sm[r_sm * ldsm + 0],i * row_stride * lddv + - // c_stride, vec[i * row_stride * lddv + c_stride]); - } - } - __syncthreads(); - - /* backward */ - T *am_sm = bm_sm; - r_rest = nr_c; - r_ghost = min(nr_c, G); - r_main = R; - r_gl = 0; - prev_vec_sm = 0.0; - - /* Load first ghost */ - if (c_sm < c_rest && f_sm < f_rest) { - for (int i = 0; i < r_ghost; i++) { - vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = - vec[get_idx(ldv1, ldv2, (nr_c - 1) - (r_gl + i), c_sm, f_sm)]; - // if (r_sm == 0) printf("r0_stride = %d, vec_sm[%d] = %f\n", r0_stride, - // i, vec_sm[i * ldsm + c_sm]); - } - } - - if (c_sm == 0 && r_sm < r_ghost) { - am_sm[r_sm] = am[(nr_c - 1) - (r_gl + r_sm)]; - dist_sm[r_sm] = dist_r[(nr_c - 1) - (r_gl + r_sm)]; - } - r_rest -= r_ghost; - __syncthreads(); - - while (r_rest > R - r_ghost) { - r_main = min(R, r_rest); - if (c_sm < c_rest && f_sm < f_rest) { - for (int i = 0; i < r_main; i++) { - vec_sm[get_idx(ldsm1, ldsm2, i + r_ghost, c_sm, f_sm)] = vec[get_idx( - ldv1, ldv2, (nr_c - 1) - (r_gl + i + r_ghost), c_sm, f_sm)]; - } - } - if (c_sm == 0 && r_sm < r_main) { - am_sm[r_sm + r_ghost] = am[(nr_c - 1) - (r_gl + r_sm + r_ghost)]; - dist_sm[r_sm + r_ghost] = dist_r[(nr_c - 1) - (r_gl + r_sm + r_ghost)]; - } - __syncthreads(); - - /* Computation of v in parallel*/ - if (c_sm < c_rest && f_sm < f_rest) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = - // __fma_rn(dist_sm[0], prev_vec_sm, vec_sm[get_idx(ldsm1, ldsm2, 0, - // c_sm, f_sm)]) * am_sm[0]; - // #else - // vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = - // (vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] - dist_sm[0] * - // prev_vec_sm) / am_sm[0]; - // #endif - - vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = - tridiag_backward(prev_vec_sm, dist_sm[0], am_sm[0], - vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)]); - for (int i = 1; i < R; i++) { - - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = - // __fma_rn(dist_sm[i], vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, - // f_sm)], - // vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]) * am_sm[i]; - // #else - // vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = - // (vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] - - // dist_sm[i] * vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, - // f_sm)]) / am_sm[i]; - // #endif - - vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = tridiag_backward( - vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, f_sm)], dist_sm[i], - am_sm[i], vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]); - } - - /* Store last v */ - prev_vec_sm = vec_sm[get_idx(ldsm1, ldsm2, R - 1, c_sm, f_sm)]; - } - __syncthreads(); - - /* flush results to v */ - if (c_sm < c_rest && f_sm < f_rest) { - for (int i = 0; i < R; i++) { - // if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 && - // threadIdx.y == 0) { - // printf("%d %d %d (%f) <- %d %d %d\n", (nr - 1) - (r_gl + i), c_sm, - // f_sm, - // vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)], i, c_sm, - // f_sm); - // } - vec[get_idx(ldv1, ldv2, (nr_c - 1) - (r_gl + i), c_sm, f_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]; - } - } - __syncthreads(); - - // /* Update unloaded col */ - r_rest -= r_main; - - /* 
Advance c */ - r_gl += R; - - /* Copy next ghost to main */ - r_ghost = min(G, r_main - (R - G)); - if (c_sm < c_rest && f_sm < f_rest) { - for (int i = 0; i < r_ghost; i++) { - vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, i + R, c_sm, f_sm)]; - } - } - if (c_sm == 0 && r_sm < r_ghost) { - am_sm[r_sm] = am_sm[r_sm + R]; - dist_sm[r_sm] = dist_sm[r_sm + R]; - } - __syncthreads(); - - } // end of while - - /* Load all rest col */ - if (c_sm < c_rest && f_sm < f_rest) { - for (int i = 0; i < r_rest; i++) { - vec_sm[get_idx(ldsm1, ldsm2, i + r_ghost, c_sm, f_sm)] = vec[get_idx( - ldv1, ldv2, (nr_c - 1) - (r_gl + i + r_ghost), c_sm, f_sm)]; - } - } - if (c_sm == 0 && r_sm < r_rest) { - am_sm[r_sm + r_ghost] = am[(nr_c - 1) - (r_gl + r_sm + r_ghost)]; - dist_sm[r_sm + r_ghost] = dist_r[(nr_c - 1) - (r_gl + r_sm + r_ghost)]; - } - __syncthreads(); - - /* Only 1 col remain */ - if (r_ghost + r_rest == 1) { - if (c_sm < c_rest && f_sm < f_rest) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = - // __fma_rn(dist_sm[0], prev_vec_sm, vec_sm[get_idx(ldsm1, ldsm2, 0, - // c_sm, f_sm)]) * am_sm[0]; - // #else - // vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = - // (vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] - dist_sm[0] * - // prev_vec_sm) / am_sm[0]; - // #endif - // if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 && - // threadIdx.y == 0) { - // printf("backward 1 (%f) %f %f %f %f\n", tridiag_backward(prev_vec_sm, - // dist_sm[0], am_sm[0], - // vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)]), prev_vec_sm, - // dist_sm[0], am_sm[0], vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, - // f_sm)]); - - // } - vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = - tridiag_backward(prev_vec_sm, dist_sm[0], am_sm[0], - vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)]); - // printf ("prev_vec_sm = %f\n", prev_vec_sm ); - // printf ("vec_sm[r_sm * ldsm + 0] = %f\n", vec_sm[r_sm * ldsm + 0] ); - } - //__syncthreads(); - - } else { - if (c_sm < c_rest && f_sm < f_rest) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = - // __fma_rn(dist_sm[0], prev_vec_sm, vec_sm[get_idx(ldsm1, ldsm2, 0, - // c_sm, f_sm)]) * am_sm[0]; - // #else - // vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = - // (vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] - dist_sm[0] * - // prev_vec_sm) / am_sm[0]; - // #endif - // if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 && - // threadIdx.y == 0) { - // printf("backward 1 (%f) %f %f %f %f\n", tridiag_backward(prev_vec_sm, - // dist_sm[0], am_sm[0], - // vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)]), prev_vec_sm, - // dist_sm[0], am_sm[0], vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, - // f_sm)]); - - // } - - vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = - tridiag_backward(prev_vec_sm, dist_sm[0], am_sm[0], - vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)]); - for (int i = 1; i < r_ghost + r_rest; i++) { - - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = - // __fma_rn(dist_sm[i], vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, - // f_sm)], - // vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]) * am_sm[i]; - // #else - // vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = - // (vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] - - // dist_sm[i] * vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, - // f_sm)]) / am_sm[i]; - // #endif - // if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 && - // threadIdx.y == 0) { printf("backward R=%d (%f) %f %f %f %f\n", i, - // tridiag_backward(vec_sm[get_idx(ldsm1, 
ldsm2, i - 1, c_sm, f_sm)],
-        // dist_sm[i], am_sm[i], vec_sm[get_idx(ldsm1, ldsm2, i, c_sm,
-        // f_sm)]), vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, f_sm)],
-        // dist_sm[i], am_sm[i], vec_sm[get_idx(ldsm1, ldsm2, i, c_sm,
-        // f_sm)]);
-
-        // }
-
-        vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = tridiag_backward(
-            vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, f_sm)], dist_sm[i],
-            am_sm[i], vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]);
-      }
-    }
-  }
-  __syncthreads();
-  /* flush results to v */
-  if (c_sm < c_rest && f_sm < f_rest) {
-    for (int i = 0; i < r_ghost + r_rest; i++) {
-      vec[get_idx(ldv1, ldv2, (nr_c - 1) - (r_gl + i), c_sm, f_sm)] =
-          vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)];
-      // printf("c_stride = %d, c_sm = %d, vec_sm = %f, vec[%d] =
-      // %f\n",c_stride, c_sm, vec_sm[r_sm * ldsm + 0],i * row_stride * lddv +
-      // c_stride, vec[i * row_stride * lddv + c_stride]);
-    }
-  }
-  __syncthreads();
-}
-
-template <DIM D, typename T, int R, int C, int F, int G>
-void ipk_3_3d_adaptive_launcher(Handle<D, T> &handle, int nr_c, int nc_c,
-                                int nf_c, T *am, T *bm, T *ddist_r, T *dv,
-                                int lddv1, int lddv2, int queue_idx) {
-
-  // printf("am: ");
-  // print_matrix_cuda(1, nr, am, nr);
-  // printf("bm: ");
-  // print_matrix_cuda(1, nr, bm, nr);
-
-  int total_thread_x = nf_c;
-  int total_thread_y = nc_c;
-  int total_thread_z = 1;
-  int tbx, tby, tbz, gridx, gridy, gridz;
-  dim3 threadsPerBlock, blockPerGrid;
-  size_t sm_size;
-
-  tbx = F; // std::max(F, std::min(F, total_thread_x));
-  tby = C; // std::max(C, std::min(C, total_thread_y));
-  tbz = 1;
-  sm_size = (C * F + 2) * (R + G) * sizeof(T);
-  gridx = ceil((float)total_thread_x / tbx);
-  gridy = ceil((float)total_thread_y / tby);
-  gridz = 1;
-  threadsPerBlock = dim3(tbx, tby, tbz);
-  blockPerGrid = dim3(gridx, gridy, gridz);
-  _ipk_3_3d<T, R, C, F, G><<<blockPerGrid, threadsPerBlock, sm_size,
-                             *(cudaStream_t *)handle.get(queue_idx)>>>(
-      nr_c, nc_c, nf_c, am, bm, ddist_r, dv, lddv1, lddv2);
-  gpuErrchk(cudaGetLastError());
-  if (handle.sync_and_check_all_kernels) {
-    gpuErrchk(cudaDeviceSynchronize());
-  }
-}
-
-template <DIM D, typename T>
-void ipk_3_3d(Handle<D, T> &handle, int nr_c, int nc_c, int nf_c, T *am, T *bm,
-              T *ddist_r, T *dv, int lddv1, int lddv2, int queue_idx,
-              int config) {
-
-#define IPK(R, C, F, G)                                                        \
-  {                                                                            \
-    ipk_3_3d_adaptive_launcher<D, T, R, C, F, G>(handle, nr_c, nc_c, nf_c, am, \
-                                                 bm, ddist_r, dv, lddv1,       \
-                                                 lddv2, queue_idx);            \
-  }
-
-  bool profile = false;
-  if (handle.profile_kernels) {
-    profile = true;
-  }
-  if (D == 3) {
-    if (profile || config == 6) {
-      IPK(2, 2, 128, 2)
-    }
-    if (profile || config == 5) {
-      IPK(2, 2, 64, 2)
-    }
-    if (profile || config == 4) {
-      IPK(2, 2, 32, 2)
-    }
-    if (profile || config == 3) {
-      IPK(2, 2, 16, 2)
-    }
-    if (profile || config == 2) {
-      IPK(8, 8, 8, 4)
-    }
-    if (profile || config == 1) {
-      IPK(4, 4, 4, 4)
-    }
-    if (profile || config == 0) {
-      IPK(2, 2, 2, 2)
-    }
-  } else {
-    printf("Error: ipk_3_3d is only for 3D data\n");
-  }
-#undef IPK
-}
-#endif
-} // namespace mgard_x
-
-#endif
\ No newline at end of file
diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/LevelwiseProcessingKernel.h b/include/mgard-x/DataRefactoring/MultiDimension/Correction/LevelwiseProcessingKernel.h
deleted file mode 100644
index 3495c8a780..0000000000
--- a/include/mgard-x/DataRefactoring/MultiDimension/Correction/LevelwiseProcessingKernel.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2022, Oak Ridge National Laboratory.
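// ---------------------------------------------------------------------------
// [Editor's aside on the ipk_*_3d launchers deleted above] Each config level
// selects an (R, C, F, G) tile, and the launcher derives the launch shape by
// ceil-division plus a shared-memory budget covering the tile and its ghost
// rows. A standalone sketch of that arithmetic (ipk3_launch_shape is a
// hypothetical helper, not MGARD API; the sm_bytes formula copies
// ipk_3_3d_adaptive_launcher):
#include <cstddef>

struct LaunchShape {
  int tbx, tby, gridx, gridy;
  std::size_t sm_bytes;
};

template <typename T>
LaunchShape ipk3_launch_shape(int R, int C, int F, int G, int nc_c, int nf_c) {
  LaunchShape s;
  s.tbx = F;                    // threads along the fastest (f) dimension
  s.tby = C;                    // threads along the c dimension
  s.gridx = (nf_c + F - 1) / F; // ceil(nf_c / F), one block per tile column
  s.gridy = (nc_c + C - 1) / C; // ceil(nc_c / C)
  // C * F threads each stream R + G rows; the extra "+ 2" rows stage am/dist.
  s.sm_bytes = std::size_t(C * F + 2) * (R + G) * sizeof(T);
  return s;
}
// ---------------------------------------------------------------------------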
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#ifndef MGARD_X_LEVELWISE_PROCESSING_KERNEL -#define MGARD_X_LEVELWISE_PROCESSING_KERNEL - -#include "Common.h" - -namespace mgard_x { - -template -void lwpk(Handle &handle, thrust::device_vector shape, T *dv, - thrust::device_vector ldvs, T *dwork, - thrust::device_vector ldws, int queue_idx); - -template -void lwpk(Handle &handle, SIZE *shape_h, SIZE *shape_d, T *dv, SIZE *ldvs, - T *dwork, SIZE *ldws, int queue_idx); - -} // namespace mgard_x - -#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/LevelwiseProcessingKernel.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Correction/LevelwiseProcessingKernel.hpp index c2d7f3fc77..607f2c650c 100644 --- a/include/mgard-x/DataRefactoring/MultiDimension/Correction/LevelwiseProcessingKernel.hpp +++ b/include/mgard-x/DataRefactoring/MultiDimension/Correction/LevelwiseProcessingKernel.hpp @@ -126,15 +126,22 @@ class LwpkReo : public AutoTuner { int queue_idx) { int range_l = std::min(6, (int)std::log2(v.getShape(0)) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); + int config = AutoTuner::autoTuningTable.lwpk[prec][range_l]; + + while (LWPK_CONFIG[D - 1][config][0] * LWPK_CONFIG[D - 1][config][1] * + LWPK_CONFIG[D - 1][config][2] > + DeviceRuntime::GetMaxNumThreadsPerTB()) { + config--; + if (config < 0) { + std::cout << log::log_err + << "Cannot find suitable config for LwpkReo.\n"; + } + } double min_time = std::numeric_limits::max(); int min_config = 0; - // int config = 0; - int config = AutoTuner::autoTuningTable.lwpk[prec][range_l]; - #define LWPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ const int R = LWPK_CONFIG[D - 1][CONFIG][0]; \ @@ -168,117 +175,6 @@ class LwpkReo : public AutoTuner { } }; -// template -// __global__ void _lwpk(SIZE *shape, T *dv, SIZE *ldvs, T *dwork, SIZE *ldws) { - -// size_t threadId = (threadIdx.z * (blockDim.x * blockDim.y)) + -// (threadIdx.y * blockDim.x) + threadIdx.x; -// SIZE *sm = SharedMemory(); -// SIZE *shape_sm = sm; -// SIZE *ldvs_sm = shape_sm + D; -// SIZE *ldws_sm = ldvs_sm + D; - -// if (threadId < D) { -// shape_sm[threadId] = shape[threadId]; -// ldvs_sm[threadId] = ldvs[threadId]; -// ldws_sm[threadId] = ldws[threadId]; -// } -// __syncthreads(); - -// SIZE idx[D]; -// SIZE firstD = div_roundup(shape_sm[0], F); - -// SIZE bidx = FunctorBase::GetBlockIdX(); -// idx[0] = (bidx % firstD) * F + threadIdx.x; - -// // printf("firstD %d idx[0] %d\n", firstD, idx[0]); - -// bidx /= firstD; -// if (D >= 2) -// idx[1] = blockIdx.y * blockDim.y + threadIdx.y; -// if (D >= 3) -// idx[2] = blockIdx.z * blockDim.z + threadIdx.z; - -// for (DIM d = 3; d < D; d++) { -// idx[d] = bidx % shape_sm[d]; -// bidx /= shape_sm[d]; -// } -// // int z = blockIdx.z * blockDim.z + threadIdx.z; -// // int y = blockIdx.y * blockDim.y + threadIdx.y; -// // int x = blockIdx.z * blockDim.z + threadIdx.z; -// bool in_range = true; -// for (DIM d = 0; d < D; d++) { -// if (idx[d] >= shape_sm[d]) -// in_range = false; -// } -// if (in_range) { -// // printf("%d %d %d %d\n", idx[3], idx[2], idx[1], idx[0]); -// if (OP == COPY) -// dwork[get_idx(ldws, idx)] = dv[get_idx(ldvs, idx)]; -// if (OP == ADD) -// dwork[get_idx(ldws, idx)] += dv[get_idx(ldvs, idx)]; -// if (OP == SUBTRACT) -// dwork[get_idx(ldws, idx)] -= dv[get_idx(ldvs, idx)]; -// } -// } - -// 
template -// void lwpk_adaptive_launcher(Handle &handle, SIZE *shape_h, SIZE -// *shape_d, -// T *dv, SIZE *ldvs, T *dwork, SIZE *ldws, -// int queue_idx) { - -// SIZE total_thread_z = shape_h[2]; -// SIZE total_thread_y = shape_h[1]; -// SIZE total_thread_x = shape_h[0]; -// // linearize other dimensions -// SIZE tbz = R; -// SIZE tby = C; -// SIZE tbx = F; -// SIZE gridz = ceil((float)total_thread_z / tbz); -// SIZE gridy = ceil((float)total_thread_y / tby); -// SIZE gridx = ceil((float)total_thread_x / tbx); -// for (DIM d = 3; d < D; d++) { -// gridx *= shape_h[d]; -// } - -// // printf("exec: %d %d %d %d %d %d\n", tbx, tby, tbz, gridx, gridy, gridz); -// dim3 threadsPerBlock(tbx, tby, tbz); -// dim3 blockPerGrid(gridx, gridy, gridz); -// size_t sm_size = (D * 3) * sizeof(SIZE); -// _lwpk<<>>( -// shape_d, dv, ldvs, dwork, ldws); - -// gpuErrchk(cudaGetLastError()); -// if (handle.sync_and_check_all_kernels) { -// gpuErrchk(cudaDeviceSynchronize()); -// } -// } - -// template -// void lwpk(Handle &handle, SIZE *shape_h, SIZE *shape_d, T *dv, SIZE -// *ldvs, -// T *dwork, SIZE *ldws, int queue_idx) { -// #define COPYLEVEL(R, C, F) \ -// { \ -// lwpk_adaptive_launcher(handle, shape_h, shape_d, dv, \ -// ldvs, dwork, ldws, queue_idx); -// \ -// } -// if (D >= 3) { -// COPYLEVEL(4, 4, 4) -// } -// if (D == 2) { -// COPYLEVEL(1, 4, 4) -// } -// if (D == 1) { -// COPYLEVEL(1, 1, 8) -// } - -// #undef COPYLEVEL -// } - template class LevelwiseCalcNDFunctor : public Functor { diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel.h b/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel.h deleted file mode 100644 index 0d5dde4b48..0000000000 --- a/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
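// ---------------------------------------------------------------------------
// [Editor's aside on the commented-out lwpk code removed above] Stripped of
// the N-D index bookkeeping, the levelwise kernel is an elementwise
// COPY/ADD/SUBTRACT between the data buffer and the work buffer. A serial
// reference for the flat case (names are illustrative, not MGARD API; the OP
// semantics copy the deleted kernel body):
enum LwpkOp { LWPK_COPY, LWPK_ADD, LWPK_SUBTRACT };

template <typename T>
void lwpk_reference(long n, const T *dv, T *dwork, LwpkOp op) {
  for (long i = 0; i < n; i++) {
    if (op == LWPK_COPY)
      dwork[i] = dv[i]; // overwrite the work buffer
    else if (op == LWPK_ADD)
      dwork[i] += dv[i]; // apply a correction
    else
      dwork[i] -= dv[i]; // remove a correction
  }
}
// ---------------------------------------------------------------------------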
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#ifndef MGARD_X_LINEAR_PROCESSING_KERNEL -#define MGARD_X_LINEAR_PROCESSING_KERNEL - -#include "../../Common.h" - -namespace mgard_x { - -template -void lpk_reo_1(Handle &handle, SIZE *shape_h, SIZE *shape_c_h, - SIZE *shape_d, SIZE *shape_c_d, SIZE *ldvs, SIZE *ldws, - DIM processed_n, DIM *processed_dims_h, DIM *processed_dims_d, - DIM curr_dim_r, DIM curr_dim_c, DIM curr_dim_f, T *ddist_f, - T *dratio_f, T *dv1, LENGTH lddv11, LENGTH lddv12, T *dv2, - LENGTH lddv21, LENGTH lddv22, T *dw, LENGTH lddw1, LENGTH lddw2, - int queue_idx, int config); - -template -void lpk_reo_2(Handle &handle, SIZE *shape_h, SIZE *shape_c_h, - SIZE *shape_d, SIZE *shape_c_d, SIZE *ldvs, SIZE *ldws, - DIM processed_n, DIM *processed_dims_h, DIM *processed_dims_d, - DIM curr_dim_r, DIM curr_dim_c, DIM curr_dim_f, T *ddist_c, - T *dratio_c, T *dv1, LENGTH lddv11, LENGTH lddv12, T *dv2, - LENGTH lddv21, LENGTH lddv22, T *dw, LENGTH lddw1, LENGTH lddw2, - int queue_idx, int config); - -template -void lpk_reo_3(Handle &handle, SIZE *shape_h, SIZE *shape_c_h, - SIZE *shape_d, SIZE *shape_c_d, SIZE *ldvs, SIZE *ldws, - DIM processed_n, DIM *processed_dims_h, DIM *processed_dims_d, - DIM curr_dim_r, DIM curr_dim_c, DIM curr_dim_f, T *ddist_r, - T *dratio_r, T *dv1, LENGTH lddv11, LENGTH lddv12, T *dv2, - LENGTH lddv21, LENGTH lddv22, T *dw, LENGTH lddw1, LENGTH lddw2, - int queue_idx, int config); - -} // namespace mgard_x - -#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel.hpp index 909e5b5a2e..99bdcf2394 100644 --- a/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel.hpp +++ b/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel.hpp @@ -552,14 +552,11 @@ class Lpk1Reo : public AutoTuner { SubArray v1, SubArray v2, SubArray w, int queue_idx) { int range_l = std::min(6, (int)std::log2(shape.dataHost()[curr_dim_f]) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_mr1[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.lpk1_nd[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define LPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -572,22 +569,26 @@ class Lpk1Reo : public AutoTuner { shape, shape_c, processed_n, processed_dims, curr_dim_r, curr_dim_c, \ curr_dim_f, dist_f, ratio_f, v1, v2, w, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - LPK(0) - LPK(1) - LPK(2) - LPK(3) - LPK(4) - LPK(5) - LPK(6) + LPK(6) if (!ret.success) config--; + LPK(5) if (!ret.success) config--; + LPK(4) if (!ret.success) config--; + LPK(3) if (!ret.success) config--; + LPK(2) if (!ret.success) config--; + LPK(1) if (!ret.success) config--; + LPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for Lpk1Reo.\n"; + exit(-1); + } #undef LPK if 
(AutoTuner::ProfileKernels) { @@ -1044,12 +1045,10 @@ class Lpk2Reo : public AutoTuner { std::min(6, (int)std::log2(shape_c.dataHost()[curr_dim_f]) - 1); int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_mr2[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.lpk2_nd[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define LPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -1062,22 +1061,26 @@ class Lpk2Reo : public AutoTuner { shape, shape_c, processed_n, processed_dims, curr_dim_r, curr_dim_c, \ curr_dim_f, dist_c, ratio_c, v1, v2, w, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - LPK(0) - LPK(1) - LPK(2) - LPK(3) - LPK(4) - LPK(5) - LPK(6) + LPK(6) if (!ret.success) config--; + LPK(5) if (!ret.success) config--; + LPK(4) if (!ret.success) config--; + LPK(3) if (!ret.success) config--; + LPK(2) if (!ret.success) config--; + LPK(1) if (!ret.success) config--; + LPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for Lpk2Reo.\n"; + exit(-1); + } #undef LPK if (AutoTuner::ProfileKernels) { @@ -1547,12 +1550,10 @@ class Lpk3Reo : public AutoTuner { std::min(6, (int)std::log2(shape_c.dataHost()[curr_dim_f]) - 1); int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_mr3[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.lpk3_nd[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define LPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -1565,22 +1566,26 @@ class Lpk3Reo : public AutoTuner { shape, shape_c, processed_n, processed_dims, curr_dim_r, curr_dim_c, \ curr_dim_f, dist_r, ratio_r, v1, v2, w, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - LPK(0) - LPK(1) - LPK(2) - LPK(3) - LPK(4) - LPK(5) - LPK(6) + LPK(6) if (!ret.success) config--; + LPK(5) if (!ret.success) config--; + LPK(4) if (!ret.success) config--; + LPK(3) if (!ret.success) config--; + LPK(2) if (!ret.success) config--; + LPK(1) if (!ret.success) config--; + LPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for Lpk3Reo.\n"; + exit(-1); + } #undef LPK if (AutoTuner::ProfileKernels) { @@ -1589,1434 +1594,6 @@ class Lpk3Reo : public AutoTuner { } }; -// template -// __global__ void -// _lpk_reo_1(SIZE *shape, SIZE *shape_c, SIZE *ldvs, SIZE *ldws, DIM -// processed_n, -// DIM *processed_dims, DIM curr_dim_r, DIM curr_dim_c, DIM -// curr_dim_f, T *ddist_f, T *dratio_f, T *dv1, LENGTH lddv11, LENGTH -// lddv12, T *dv2, LENGTH lddv21, LENGTH lddv22, T *dw, LENGTH lddw1, -// LENGTH lddw2) { - -// // bool debug = false; -// // if (blockIdx.z == 0 && blockIdx.y == 1 && blockIdx.x == 1 && -// // threadIdx.z == 
0 && threadIdx.y == 0 ) debug = false; - -// // bool debug = false; -// // if (threadIdx.z == 0 && threadIdx.y == 0 && threadIdx.x == 0 ) debug = -// // true; - -// LENGTH threadId = (threadIdx.z * (blockDim.x * blockDim.y)) + -// (threadIdx.y * blockDim.x) + threadIdx.x; - -// T *sm = SharedMemory(); -// SIZE ldsm1 = F * 2 + 3; -// SIZE ldsm2 = C; -// T *v_sm = sm; sm += ldsm1 * ldsm2 * R; - -// T *dist_f_sm = sm; sm += ldsm1; -// T *ratio_f_sm = sm; sm += ldsm1; - -// SIZE * sm_size = (SIZE*)sm; -// SIZE *shape_sm = sm_size; sm_size += D; -// SIZE *shape_c_sm = sm_size; sm_size += D; -// SIZE *ldvs_sm = sm_size; sm_size += D; -// SIZE *ldws_sm = sm_size; sm_size += D; -// sm = (T*)sm_size; - -// DIM * sm_dim = (DIM*)sm; -// DIM *processed_dims_sm = sm_dim; sm_dim += D; -// sm = (T*)sm_dim; - -// SIZE idx[D]; -// if (threadId < D) { -// shape_sm[threadId] = shape[threadId]; -// shape_c_sm[threadId] = shape_c[threadId]; -// ldvs_sm[threadId] = ldvs[threadId]; -// ldws_sm[threadId] = ldws[threadId]; -// } -// if (threadId < processed_n) { -// processed_dims_sm[threadId] = processed_dims[threadId]; -// } -// __syncthreads(); - -// for (DIM d = 0; d < D; d++) -// idx[d] = 0; - -// SIZE nr = shape_sm[curr_dim_r]; -// SIZE nc = shape_sm[curr_dim_c]; -// SIZE nf = shape_sm[curr_dim_f]; -// SIZE nf_c = shape_c_sm[curr_dim_f]; - -// bool zero_other = true; -// bool PADDING = (nf % 2 == 0); - -// SIZE bidx = blockIdx.x; -// SIZE firstD; -// if (nf_c % 2 == 1) { -// firstD = div_roundup(nf_c, blockDim.x); -// } else { -// firstD = div_roundup(nf_c, blockDim.x); -// } -// SIZE blockId = bidx % firstD; -// bidx /= firstD; - -// for (DIM d = 0; d < D; d++) { -// if (d != curr_dim_r && d != curr_dim_c && d != curr_dim_f) { -// SIZE t = shape_sm[d]; -// for (DIM k = 0; k < processed_n; k++) { -// if (d == processed_dims[k]) { -// t = shape_c_sm[d]; -// } -// } -// idx[d] = bidx % t; -// bidx /= t; -// if (idx[d] >= shape_c_sm[d]) -// zero_other = false; -// } -// } - -// SIZE zero_r = shape_c_sm[curr_dim_r]; -// SIZE zero_c = shape_c_sm[curr_dim_c]; -// SIZE zero_f = shape_c_sm[curr_dim_f]; - -// if (D < 3) { -// nr = 1; -// zero_r = 1; -// } -// if (D < 2) { -// nc = 1; -// zero_c = 1; -// } - -// LENGTH other_offset_v = get_idx(ldvs_sm, idx); -// LENGTH other_offset_w = get_idx(ldws_sm, idx); - -// dv1 = dv1 + other_offset_v; -// dv2 = dv2 + other_offset_v; -// dw = dw + other_offset_w; - -// // if (debug2) { -// // printf("idx: %d %d %d %d\n", idx[3], idx[2], idx[1], idx[0]); -// // printf("other_offset_v: %llu\n", other_offset_v); -// // printf("other_offset_w: %llu\n", other_offset_w); -// // } -// SIZE r_gl = blockIdx.z * blockDim.z + threadIdx.z; -// SIZE c_gl = blockIdx.y * blockDim.y + threadIdx.y; -// SIZE f_gl = blockId * blockDim.x + threadIdx.x; - -// SIZE r_sm = threadIdx.z; -// SIZE c_sm = threadIdx.y; -// SIZE f_sm = threadIdx.x; - -// SIZE actual_F = F; -// if (nf_c - blockId * blockDim.x < F) { -// actual_F = nf_c - blockId * blockDim.x; -// } - -// // if (nf_c % 2 == 1){ -// // if(nf_c-1 - blockId * blockDim.x < F) { actual_F = nf_c - 1 - blockId -// * -// // blockDim.x; } -// // } else { -// // if(nf_c - blockId * blockDim.x < F) { actual_F = nf_c - blockId * -// // blockDim.x; } -// // } - -// // if (debug) printf("actual_F %d\n", actual_F); - -// if (r_gl < nr && c_gl < nc && f_gl < nf_c) { -// if (zero_other && r_gl < zero_r && c_gl < zero_c && f_gl < zero_f) { -// // if (debug) printf("load left vsm[%d]: 0.0\n", f_sm * 2 + 2); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 
2)] = 0.0; -// } else { -// // if (debug) printf("load left vsm[%d]<-dv1[%d, %d, %d]: %f\n", f_sm * -// 2 -// // + 2, r_gl, c_gl, f_gl, dv1[get_idx(lddv11, lddv12, r_gl, c_gl, -// f_gl)]); v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 2)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)]; -// } - -// if (f_sm == actual_F - 1) { -// if (zero_other && r_gl < zero_r && c_gl < zero_c && f_gl < zero_f) { -// // if (debug) printf("load left+1 vsm[%d]: 0.0\n", actual_F * 2 + 2); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 2)] = 0.0; -// } else { -// if (f_gl + 1 < nf_c) { -// // if (debug) printf("load left+1 vsm[%d]: %f\n", actual_F * 2 + 2, -// // dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl + 1)]); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 2)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl + 1)]; -// } else { -// // if (debug) printf("load left+1 vsm[%d]: 0.0\n", actual_F * 2 + -// 2); v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 2)] = -// 0.0; -// } -// } -// } - -// if (f_sm == 0) { -// // left -// if (zero_other && r_gl < zero_r && c_gl < zero_c && f_gl < zero_f) { -// // coarse (-1) -// // if (debug) printf("load left-1 vsm[0]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = 0.0; -// } else { -// if (f_gl >= 1) { -// // other (-1) -// // if (debug) printf("load left-1 vsm[0]: %f\n", -// dv1[get_idx(lddv11, -// // lddv12, r_gl, c_gl, f_gl-1)]); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl - 1)]; -// } else { -// // other (-1) -// // if (debug) printf("load left-1 vsm[0]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = 0.0; -// } -// } -// } - -// // right -// if (!PADDING) { //other = nf_c - 1 -// if (nf_c % 2 != 0) { -// if (f_gl >= 1 && f_gl < nf_c) { //shift for better memory access -// pattern -// // if (debug) printf("load right vsm[%d]: %f <- %d %d %d\n", f_sm * -// 2 -// // + 1, dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - 1)], r_gl, -// // c_gl, f_gl - 1); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 1)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - 1)]; -// } else { -// // if (debug) printf("load right vsm[%d]: 0\n", f_sm * 2 + 1); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 1)] = 0.0; -// } -// } else { // nf_c % 2 == 0, do not shift -// if (f_gl < nf_c - 1) { -// // if (debug) printf("load right vsm[%d]: %f <- %d %d %d\n", f_sm * -// 2 -// // + 3, dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)], r_gl, c_gl, -// // f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 3)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load right vsm[%d]: 0\n", f_sm * 2 + 3); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 3)] = 0.0; -// } -// } -// } else { // PADDING other = nf_c - 2 -// if (nf_c % 2 != 0) { -// if (f_gl >= 1 && f_gl < nf_c - 1) { //shift for better memory access -// pattern -// // if (debug) printf("load right vsm[%d]: %f <- %d %d %d\n", f_sm * -// 2 -// // + 1, dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - 1)], r_gl, -// // c_gl, f_gl - 1); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 1)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - 1)]; -// } else { -// // if (debug) printf("load right vsm[%d]: 0\n", f_sm * 2 + 1); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 1)] = 0.0; -// } -// } else { // nf_c % 2 == 0 -// if (f_gl < nf_c - 2) { // do not shift -// // if (debug) printf("load right vsm[%d]: %f <- %d %d %d\n", f_sm * -// 2 -// // + 3, 
dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)], r_gl, c_gl, -// // f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 3)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load right vsm[%d]: 0\n", f_sm * 2 + 3); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 3)] = 0.0; -// } -// } -// } - -// if (f_sm == actual_F - 1) { -// // right (+1) -// if (!PADDING) { -// if (nf_c % 2 != 0) { -// if (f_gl < nf_c - 1) { -// // if (debug) printf("load right+1 vsm[%d]: %f <- %d %d %d\n", -// // actual_F * 2 + 1, dv2[get_idx(lddv21, lddv22, r_gl, c_gl, -// f_gl)], -// // r_gl, c_gl, f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 1)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load right+1 vsm[%d]: 0.0\n", actual_F * 2 -// + -// // 1); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 1)] = 0.0; -// } -// } else { // nf_c % 2 == 0 -// if (f_gl >= actual_F) { -// // if (debug) printf("load right-1 vsm[1]: %f <- %d %d %d\n", -// // dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - actual_F)], -// r_gl, -// // c_gl, f_gl - actual_F); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 1)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - actual_F)]; -// } else { -// // if (debug) printf("load right-1 vsm[1]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 1)] = 0.0; -// } -// } -// } else { -// if (nf_c % 2 != 0) { -// if (f_gl < nf_c - 2) { -// // if (debug) printf("actual_F(%d), load right+1 vsm[%d]: %f <- -// %d %d %d\n", -// // actual_F, actual_F * 2 + 1, -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)], -// r_gl, c_gl, f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 1)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load right+1 vsm[%d]: 0.0\n", actual_F * 2 -// + -// // 1); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 1)] = 0.0; -// } -// } else { // nf_c % 2 == 0 -// if (f_gl >= actual_F && f_gl - actual_F < nf_c - 2) { -// // if (debug) printf("load right-1 vsm[1]: %f <- %d %d %d\n", -// // dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - actual_F)], -// r_gl, -// // c_gl, f_gl - actual_F); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 1)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - actual_F)]; -// } else { -// // if (debug) printf("load right-1 vsm[1]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 1)] = 0.0; -// } -// } -// } -// } -// } - -// // if (debug) printf("actual_F: %d\n", actual_F); -// if (r_sm == 0 && c_sm == 0 && f_sm < actual_F) { -// // if (debug) printf("blockId * F * 2 + f_sm = %d\n", blockId * F * 2 + -// f_sm); if (blockId * F * 2 + f_sm < nf) { // padding: num of dist == nf, -// non-padding: non of dist == nf - 1 -// // if (debug) printf("load dist/ratio1[%d]: %f <- %d\n", 2 + f_sm, -// ddist_f[blockId * F * 2 + f_sm], blockId * F * 2 + f_sm); dist_f_sm[2 + -// f_sm] = ddist_f[blockId * F * 2 + f_sm]; ratio_f_sm[2 + f_sm] = -// dratio_f[blockId * F * 2 + f_sm]; -// } else { -// // if (debug) printf("load dist/ratio1[%d]: 0.0\n", 2 + f_sm); -// dist_f_sm[2 + f_sm] = 0.0; -// ratio_f_sm[2 + f_sm] = 0.0; -// } - -// if (blockId * F * 2 + actual_F + f_sm < nf) { -// // if (debug) printf("load dist/ratio2[%d]: %f <- %d\n", 2 + actual_F + -// f_sm, ddist_f[blockId * F * 2 + actual_F + f_sm], blockId * F * 2 + -// actual_F + f_sm); dist_f_sm[2 + actual_F + f_sm] = -// ddist_f[blockId * F * 2 + actual_F + f_sm]; -// ratio_f_sm[2 + actual_F + f_sm] = -// dratio_f[blockId * 
F * 2 + actual_F + f_sm]; -// } else { -// // if (debug) printf("load dist/ratio2[%d]: 0.0\n", 2 + actual_F + -// f_sm); dist_f_sm[2 + actual_F + f_sm] = 0.0; ratio_f_sm[2 + actual_F + -// f_sm] = 0.0; -// } -// } - -// if (blockId > 0) { -// if (f_sm < 2) { -// // dist_f_sm[f_sm] = ddist_f[f_gl - 2]; -// // ratio_f_sm[f_sm] = dratio_f[f_gl - 2]; -// // if (debug) printf("load dist/ratio-1[%d]: %f <- %d\n", f_sm, -// ddist_f[blockId * F * 2 + f_sm - 2], blockId * F * 2 + f_sm - 2); -// dist_f_sm[f_sm] = ddist_f[blockId * F * 2 + f_sm - 2]; -// ratio_f_sm[f_sm] = dratio_f[blockId * F * 2 + f_sm - 2]; -// } -// } else { -// if (f_sm < 2) { -// // if (debug) printf("load dist/ratio-1[%d]: 0.0 <- %d\n", f_sm); -// dist_f_sm[f_sm] = 0.0; -// ratio_f_sm[f_sm] = 0.0; -// } -// } - -// __syncthreads(); - -// if (r_gl < nr && c_gl < nc && f_gl < nf_c) { -// T h1 = dist_f_sm[f_sm * 2]; -// T h2 = dist_f_sm[f_sm * 2 + 1]; -// T h3 = dist_f_sm[f_sm * 2 + 2]; -// T h4 = dist_f_sm[f_sm * 2 + 3]; -// T r1 = ratio_f_sm[f_sm * 2]; -// T r2 = ratio_f_sm[f_sm * 2 + 1]; -// T r3 = ratio_f_sm[f_sm * 2 + 2]; -// T r4 = 1 - r3; -// T a = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2)]; -// T b = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 1)]; -// T c = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 2)]; -// T d = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 3)]; -// T e = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 4)]; - -// // bool debug = false; -// // if (idx[3] == 0) debug = false; -// // if (debug) { -// // printf("f_sm(%d) %f %f %f %f %f f_sm_h %f %f %f %f f_sm_r %f %f %f -// %f, out: %f\n",f_sm, a,b,c,d,e, h1,h2,h3,h4,r1,r2,r3,r4, mass_trans(a, b, -// c, d, e, h1, h2, h3, h4, r1, r2, r3, r4)); -// // } - -// // T tb = a * h1/6 + b * (h1+h2)/3 + c * h2/6; -// // T tc = b * h2/6 + c * (h2+h3)/3 + d * h3/6; -// // T td = c * h3/6 + d * (h3+h4)/3 + e * h4/6; - -// // if (debug) printf("f_sm(%d) tb tc td tc: %f %f %f %f\n", f_sm, tb, tc, -// // td, tc+tb * r1 + td * r4); - -// // tc += tb * r1 + td * r4; - -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = -// mass_trans(a, b, c, d, e, h1, h2, h3, h4, r1, r2, r3, r4); - -// // if (debug) printf("store[%d %d %d] %f \n", r_gl, c_gl, f_gl, -// // mass_trans(a, b, c, d, e, h1, h2, h3, h4, r1, r2, r3, r4)); - -// // printf("test block %d F %d nf %d\n", blockId, F, nf); -// // if (f_gl+1 == nf_c-1) { - -// // // T te = h4 * d + 2 * h4 * e; -// // //printf("f_sm(%d) mm-e: %f\n", f_sm, te); -// // // te += td * r3; -// // dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl+1)] = -// // mass_trans(c, d, e, (T)0.0, (T)0.0, h1, h2, (T)0.0, (T)0.0, r1, -// r2, -// // (T)0.0, (T)0.0); -// // } -// } -// } - -// template -// void lpk_reo_1_adaptive_launcher(Handle &handle, SIZE *shape_h, -// SIZE *shape_c_h, SIZE *shape_d, SIZE -// *shape_c_d, SIZE *ldvs, SIZE *ldws, DIM -// processed_n, DIM *processed_dims_h, DIM -// *processed_dims_d, DIM curr_dim_r, DIM -// curr_dim_c, DIM curr_dim_f, T *ddist_f, T -// *dratio_f, T *dv1, LENGTH lddv11, LENGTH -// lddv12, T *dv2, LENGTH lddv21, LENGTH -// lddv22, T *dw, LENGTH lddw1, LENGTH lddw2, -// int queue_idx) { -// SIZE nr = shape_h[curr_dim_r]; -// SIZE nc = shape_h[curr_dim_c]; -// SIZE nf = shape_h[curr_dim_f]; -// SIZE nf_c = shape_c_h[curr_dim_f]; - -// SIZE total_thread_z = nr; -// SIZE total_thread_y = nc; -// SIZE total_thread_x = nf_c; -// // if (nf_c % 2 == 1) { total_thread_x = nf_c - 1; } -// // else { total_thread_x = nf_c; } -// SIZE tbx, tby, tbz, gridx, gridy, gridz; -// dim3 threadsPerBlock, 
blockPerGrid; -// size_t sm_size; - -// tbz = R; -// tby = C; -// tbx = F; -// sm_size = (R * C * (F * 2 + 3) + (F * 2 + 3) * 2) * sizeof(T); -// sm_size += (D * 4) * sizeof(SIZE); -// sm_size += (D * 1) * sizeof(DIM); -// gridz = ceil((float)total_thread_z / tbz); -// gridy = ceil((float)total_thread_y / tby); -// gridx = ceil((float)total_thread_x / tbx); -// for (DIM d = 0; d < D; d++) { -// if (d != curr_dim_f && d != curr_dim_c && d != curr_dim_r) { -// SIZE t = shape_h[d]; -// for (DIM k = 0; k < processed_n; k++) { -// if (d == processed_dims_h[k]) { -// t = shape_c_h[d]; -// } -// } -// gridx *= t; -// } -// } -// threadsPerBlock = dim3(tbx, tby, tbz); -// blockPerGrid = dim3(gridx, gridy, gridz); - -// // printf("lpk_reo_1 exec config (%d %d %d) (%d %d %d)\n", tbx, tby, tbz, -// // gridx, gridy, gridz); -// _lpk_reo_1<<>>( -// shape_d, shape_c_d, ldvs, ldws, processed_n, processed_dims_d, -// curr_dim_r, curr_dim_c, curr_dim_f, ddist_f, dratio_f, dv1, lddv11, -// lddv12, dv2, lddv21, lddv22, dw, lddw1, lddw2); -// gpuErrchk(cudaGetLastError()); -// if (handle.sync_and_check_all_kernels) { -// gpuErrchk(cudaDeviceSynchronize()); -// } -// } - -// template -// void lpk_reo_1(Handle &handle, SIZE *shape_h, SIZE *shape_c_h, SIZE -// *shape_d, -// SIZE *shape_c_d, SIZE *ldvs, SIZE *ldws, DIM processed_n, -// DIM *processed_dims_h, DIM *processed_dims_d, DIM curr_dim_r, -// DIM curr_dim_c, DIM curr_dim_f, T *ddist_f, T *dratio_f, T -// *dv1, LENGTH lddv11, LENGTH lddv12, T *dv2, LENGTH lddv21, -// LENGTH lddv22, T *dw, LENGTH lddw1, LENGTH lddw2, int -// queue_idx, int config) { -// #define LPK(R, C, F) \ -// { \ -// lpk_reo_1_adaptive_launcher( \ -// handle, shape_h, shape_c_h, shape_d, shape_c_d, ldvs, ldws, \ -// processed_n, processed_dims_h, processed_dims_d, curr_dim_r, \ -// curr_dim_c, curr_dim_f, ddist_f, dratio_f, dv1, lddv11, lddv12, dv2, -// \ -// lddv21, lddv22, dw, lddw1, lddw2, queue_idx); \ -// } - -// bool profile = false; -// if (handle.profile_kernels) { -// profile = true; -// } -// if (D >= 3) { -// if (profile || config == 6) { -// LPK(2, 2, 128) -// } -// if (profile || config == 5) { -// LPK(2, 2, 64) -// } -// if (profile || config == 4) { -// LPK(2, 2, 32) -// } -// if (profile || config == 3) { -// LPK(4, 4, 16) -// } -// if (profile || config == 2) { -// LPK(8, 8, 8) -// } -// if (profile || config == 1) { -// LPK(4, 4, 4) -// } -// if (profile || config == 0) { -// LPK(2, 2, 2) -// } -// } else if (D == 2) { -// if (profile || config == 6) { -// LPK(1, 2, 128) -// } -// if (profile || config == 5) { -// LPK(1, 2, 64) -// } -// if (profile || config == 4) { -// LPK(1, 2, 32) -// } -// if (profile || config == 3) { -// LPK(1, 4, 16) -// } -// if (profile || config == 2) { -// LPK(1, 8, 8) -// } -// if (profile || config == 1) { -// LPK(1, 4, 4) -// } -// if (profile || config == 0) { -// LPK(1, 2, 4) -// } -// } else if (D == 1) { -// if (profile || config == 6) { -// LPK(1, 1, 128) -// } -// if (profile || config == 5) { -// LPK(1, 1, 64) -// } -// if (profile || config == 4) { -// LPK(1, 1, 32) -// } -// if (profile || config == 3) { -// LPK(1, 1, 16) -// } -// if (profile || config == 2) { -// LPK(1, 1, 8) -// } -// if (profile || config == 1) { -// LPK(1, 1, 8) -// } -// if (profile || config == 0) { -// LPK(1, 1, 8) -// } -// } - -// #undef LPK -// } - -// template -// __global__ void -// _lpk_reo_2(SIZE *shape, SIZE *shape_c, SIZE *ldvs, SIZE *ldws, DIM -// processed_n, -// DIM *processed_dims, DIM curr_dim_r, DIM curr_dim_c, DIM -// curr_dim_f, T 
*ddist_c, T *dratio_c, T *dv1, LENGTH lddv11, LENGTH -// lddv12, T *dv2, LENGTH lddv21, LENGTH lddv22, T *dw, LENGTH lddw1, -// LENGTH lddw2) { - -// // bool debug = false; -// // if (blockIdx.z == 0 && blockIdx.y == 0 && blockIdx.x == 0 && -// // threadIdx.z == 0 && threadIdx.x == 0 ) debug = false; - -// // bool debug2 = false; -// // if (threadIdx.z == 0 && threadIdx.y == 0 && threadIdx.x == 0 ) debug2 = -// // false; - -// LENGTH threadId = (threadIdx.z * (blockDim.x * blockDim.y)) + -// (threadIdx.y * blockDim.x) + threadIdx.x; - -// T *sm = SharedMemory(); -// SIZE ldsm1 = F; -// SIZE ldsm2 = C * 2 + 3; -// T *v_sm = sm; sm += ldsm1 * ldsm2 * R; - -// T *dist_c_sm = sm; sm += ldsm2; -// T *ratio_c_sm = sm; sm += ldsm2; - -// SIZE * sm_size = (SIZE*)sm; -// SIZE *shape_sm = sm_size; sm_size += D; -// SIZE *shape_c_sm = sm_size; sm_size += D; -// SIZE *ldvs_sm = sm_size; sm_size += D; -// SIZE *ldws_sm = sm_size; sm_size += D; -// sm = (T*)sm_size; - -// DIM * sm_dim = (DIM*)sm; -// DIM *processed_dims_sm = sm_dim; sm_dim += D; -// sm = (T*)sm_dim; - -// SIZE idx[D]; -// if (threadId < D) { -// shape_sm[threadId] = shape[threadId]; -// shape_c_sm[threadId] = shape_c[threadId]; -// ldvs_sm[threadId] = ldvs[threadId]; -// ldws_sm[threadId] = ldws[threadId]; -// } -// if (threadId < processed_n) { -// processed_dims_sm[threadId] = processed_dims[threadId]; -// } -// __syncthreads(); - -// for (DIM d = 0; d < D; d++) -// idx[d] = 0; - -// SIZE nr = shape_sm[curr_dim_r]; -// SIZE nc = shape_sm[curr_dim_c]; -// SIZE nf_c = shape_c_sm[curr_dim_f]; -// SIZE nc_c = shape_c_sm[curr_dim_c]; -// bool PADDING = (nc % 2 == 0); - -// if (D < 3) { -// nr = 1; -// } - -// SIZE bidx = blockIdx.x; -// SIZE firstD = div_roundup(nf_c, blockDim.x); -// SIZE blockId_f = bidx % firstD; -// bidx /= firstD; - -// for (DIM d = 0; d < D; d++) { -// if (d != curr_dim_r && d != curr_dim_c && d != curr_dim_f) { -// SIZE t = shape_sm[d]; -// for (DIM k = 0; k < processed_n; k++) { -// if (d == processed_dims[k]) { -// t = shape_c_sm[d]; -// } -// } -// idx[d] = bidx % t; -// bidx /= t; -// } -// } - -// LENGTH other_offset_v = get_idx(ldvs_sm, idx); -// LENGTH other_offset_w = get_idx(ldws_sm, idx); - -// dv1 = dv1 + other_offset_v; -// dv2 = dv2 + other_offset_v; -// dw = dw + other_offset_w; - -// // if (debug2) { -// // printf("idx: %d %d %d %d\n", idx[3], idx[2], idx[1], idx[0]); -// // printf("other_offset_v: %llu\n", other_offset_v); -// // printf("other_offset_w: %llu\n", other_offset_w); -// // } - -// SIZE r_gl = blockIdx.z * blockDim.z + threadIdx.z; -// SIZE c_gl = blockIdx.y * blockDim.y + threadIdx.y; -// SIZE f_gl = blockId_f * blockDim.x + threadIdx.x; - -// SIZE blockId = blockIdx.y; - -// SIZE r_sm = threadIdx.z; -// SIZE c_sm = threadIdx.y; -// SIZE f_sm = threadIdx.x; - -// SIZE actual_C = C; -// if (nc_c - blockIdx.y * blockDim.y < C) { -// actual_C = nc_c - blockIdx.y * blockDim.y; -// } - -// // if (nc_c % 2 == 1){ -// // if(nc_c-1 - blockIdx.y * blockDim.y < C) { actual_C = nc_c - 1 - -// // blockIdx.y * blockDim.y; } -// // } else { -// // if(nc_c - blockIdx.y * blockDim.y < C) { actual_C = nc_c - blockIdx.y -// * -// // blockDim.y; } -// // } - -// // bool debug = false; -// // if (idx[3] == 0 && r_gl == 0 ) debug = false; - -// // if (debug) printf("actual_C %d\n", actual_C); - -// if (r_gl < nr && c_gl < nc_c && f_gl < nf_c) { -// // if (debug) printf("load up vsm[%d]: %f <- %d %d %d\n", c_sm * 2 + 2, -// // dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)], r_gl, c_gl, f_gl); -// 
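The kernel being deleted above folds every dimension other than the three currently processed (r, c, f) into gridDim.x: the launcher multiplies gridx by each leftover dimension's (possibly coarsened) size, and the kernel recovers the per-dimension indices with repeated mod/div against those same sizes. A minimal host-side sketch of that encode/decode round trip; `fold_extra_dims` and `unfold_extra_dims` are illustrative names, not MGARD-X functions:

```cpp
#include <cstdint>
#include <vector>

// Hypothetical stand-ins for MGARD-X's SIZE/DIM typedefs.
using SIZE = uint32_t;
using DIM = uint32_t;

// Launcher side: fold the sizes of all "extra" dimensions into gridDim.x,
// mirroring the deleted launchers' `gridx *= t;` loop.
SIZE fold_extra_dims(SIZE gridx, const std::vector<SIZE> &extra_sizes) {
  for (SIZE t : extra_sizes)
    gridx *= t;
  return gridx;
}

// Kernel side: peel the extra-dimension indices back off blockIdx.x with
// repeated mod/div, as the deleted `_lpk_reo_*` kernels do with
// `idx[d] = bidx % t; bidx /= t;`.
std::vector<SIZE> unfold_extra_dims(SIZE bidx,
                                    const std::vector<SIZE> &extra_sizes) {
  std::vector<SIZE> idx(extra_sizes.size());
  for (DIM d = 0; d < extra_sizes.size(); d++) {
    idx[d] = bidx % extra_sizes[d];
    bidx /= extra_sizes[d];
  }
  return idx;
}
```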
v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 2, f_sm)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)]; - -// if (c_sm == actual_C - 1) { -// if (c_gl + 1 < nc_c) { -// // if (debug) printf("load up+1 vsm[%d]: %f <- %d %d %d\n", actual_C -// * 2 -// // + 2, dv1[get_idx(lddv11, lddv12, r_gl, blockId * C + actual_C, -// // f_gl)], r_gl, blockId * C + actual_C, f_gl); -// // c_gl+1 == blockId * C + C -// v_sm[get_idx(ldsm1, ldsm2, r_sm, actual_C * 2 + 2, f_sm)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl + 1, f_gl)]; -// } else { -// // if (debug) printf("load up+1 vsm[%d]: 0.0\n", actual_C * 2 + 2); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, actual_C * 2 + 2, f_sm)] = 0.0; -// } -// } - -// if (c_sm == 0) { -// if (c_gl >= 1) { -// // if (debug) printf("load up-1 vsm[0]: %f <- %d %d %d\n", -// // dv1[get_idx(lddv11, lddv12, r_gl, c_gl-1, f_gl)], r_gl, c_gl-1, -// // f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl - 1, f_gl)]; -// } else { -// // if (debug) printf("load up-1 vsm[0]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = 0.0; -// } -// } - -// if (!PADDING) { -// if (c_gl < nc_c - 1) { -// // if (debug) printf("load down vsm[%d]: %f <- %d %d %d\n", c_sm * 2 -// + -// // 3, dv2[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)], r_gl, c_gl, -// f_gl); v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 3, f_sm)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load down vsm[%d]: 0.0\n", c_sm * 2 + 3); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 3, f_sm)] = 0.0; -// } -// } else { -// if (c_gl < nc_c - 2) { -// // if (debug) printf("load down vsm[%d]: %f <- %d %d %d\n", c_sm * 2 -// + -// // 3, dv2[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)], r_gl, c_gl, -// f_gl); v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 3, f_sm)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load down vsm[%d]: 0.0\n", c_sm * 2 + 3); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 3, f_sm)] = 0.0; -// } -// } - -// if (c_gl >= 1 && -// (PADDING && c_gl - 1 < nc_c - 2 || !PADDING && c_gl - 1 < nc_c - 1)) -// { -// if (c_sm == 0) { -// // if (debug) printf("load down-1 vsm[1]: %f <- %d %d %d\n", -// // dv2[get_idx(lddv11, lddv12, r_gl, c_gl-1, f_gl)], r_gl, c_gl-1, -// // f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, 1, f_sm)] = -// dv2[get_idx(lddv11, lddv12, r_gl, c_gl - 1, f_gl)]; -// } -// } else { -// if (c_sm == 0) { -// // if (debug) printf("load down-1 vsm[1]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, 1, f_sm)] = 0.0; -// } -// } -// } - -// // load dist/ratio using f_sm for better performance -// // assumption F >= C -// if (r_sm == 0 && c_sm == 0 && f_sm < actual_C) { -// if (blockId * C * 2 + f_sm < nc) { -// dist_c_sm[2 + f_sm] = ddist_c[blockId * C * 2 + f_sm]; -// ratio_c_sm[2 + f_sm] = dratio_c[blockId * C * 2 + f_sm]; -// } else { -// dist_c_sm[2 + f_sm] = 0.0; -// ratio_c_sm[2 + f_sm] = 0.0; -// } - -// if (blockId * C * 2 + actual_C + f_sm < nc) { -// dist_c_sm[2 + actual_C + f_sm] = -// ddist_c[blockId * C * 2 + actual_C + f_sm]; -// ratio_c_sm[2 + actual_C + f_sm] = -// dratio_c[blockId * C * 2 + actual_C + f_sm]; -// } else { -// dist_c_sm[2 + actual_C + f_sm] = 0.0; -// ratio_c_sm[2 + actual_C + f_sm] = 0.0; -// } -// } - -// if (blockId > 0) { -// if (f_sm < 2) { -// dist_c_sm[f_sm] = ddist_c[blockId * C * 2 - 2 + f_sm]; -// ratio_c_sm[f_sm] = dratio_c[blockId * C * 2 - 2 + f_sm]; -// } -// } else { -// if (f_sm < 2) { -// dist_c_sm[f_sm] = 0.0; -// 
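The load phase above stages each block's 2*C distance and ratio entries into shared memory behind a two-element halo taken from the previous block, zero-filling anything past the end of the array. A condensed CUDA sketch of that staging pattern, assuming at least 2*C threads along f so one pass suffices; the deleted code instead loads in two halves of `actual_C` to handle tail blocks:

```cuda
// Condensed sketch of the dist/ratio staging in the deleted kernels: slots
// [0,1] hold the previous block's last two entries (or 0 at the left edge),
// and slots [2 .. 2*C+1] hold this block's 2*C entries, zero-padded past n.
template <typename T, int C>
__device__ void load_dist_with_halo(const T *ddist, T *dist_sm, int n) {
  const int blockId = blockIdx.y; // _lpk_reo_2 tiles the c axis along y
  const int tid = threadIdx.x;    // f-threads do the loading "for better
                                  // performance", per the deleted comment
  if (tid < 2 * C) {
    int gl = blockId * C * 2 + tid;
    dist_sm[2 + tid] = (gl < n) ? ddist[gl] : (T)0.0;
  }
  if (tid < 2) { // two-entry halo reaching back into the previous block
    dist_sm[tid] = (blockId > 0) ? ddist[blockId * C * 2 - 2 + tid] : (T)0.0;
  }
  __syncthreads();
}
```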
ratio_c_sm[f_sm] = 0.0; -// } -// } - -// __syncthreads(); - -// if (r_gl < nr && c_gl < nc_c && f_gl < nf_c) { -// T h1 = dist_c_sm[c_sm * 2]; -// T h2 = dist_c_sm[c_sm * 2 + 1]; -// T h3 = dist_c_sm[c_sm * 2 + 2]; -// T h4 = dist_c_sm[c_sm * 2 + 3]; -// T r1 = ratio_c_sm[c_sm * 2]; -// T r2 = ratio_c_sm[c_sm * 2 + 1]; -// T r3 = ratio_c_sm[c_sm * 2 + 2]; -// T r4 = 1 - r3; -// T a = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2, f_sm)]; -// T b = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 1, f_sm)]; -// T c = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 2, f_sm)]; -// T d = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 3, f_sm)]; -// T e = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 4, f_sm)]; - -// // if (debug) { -// // printf("c_sm(%d) %f %f %f %f %f\n",c_sm, a,b,c,d,e); -// // printf("c_sm_h(%d) %f %f %f %f\n",c_sm, h1,h2,h3,h4); -// // printf("c_sm_r(%d) %f %f %f %f\n",c_sm, r1,r2,r3,r4); -// // } - -// // T tb = a * h1 + b * 2 * (h1+h2) + c * h2; -// // T tc = b * h2 + c * 2 * (h2+h3) + d * h3; -// // T td = c * h3 + d * 2 * (h3+h4) + e * h4; - -// // if (debug) printf("c_sm(%d) tb tc td tc: %f %f %f %f\n", f_sm, tb, tc, -// // td, tc+tb * r1 + td * r4); - -// // tc += tb * r1 + td * r4; - -// // if (r_gl == 0 && f_gl == 0 && r_sm == 0 && f_sm == 0) { -// // printf("mr2(%d) mm2: %f -> (%d %d %d)\n", c_sm, tc, r_gl, c_gl, -// f_gl); -// // // printf("f_sm(%d) b c d: %f %f %f\n", f_sm, tb, tc, td); -// // } - -// // if (debug) { -// // printf("f_sm(%d) %f %f %f %f %f f_sm_h %f %f %f %f f_sm_r %f %f %f -// %f, out: %f\n",f_sm, a,b,c,d,e, h1,h2,h3,h4,r1,r2,r3,r4, mass_trans(a, b, -// c, d, e, h1, h2, h3, h4, r1, r2, r3, r4)); -// // } - -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = -// mass_trans(a, b, c, d, e, h1, h2, h3, h4, r1, r2, r3, r4); - -// // if (debug) printf("store[%d %d %d] %f \n", r_gl, c_gl, f_gl, -// // mass_trans(a, b, c, d, e, h1, h2, h3, h4, r1, r2, r3, r4)); - -// // printf("%d %d %d\n", r_gl, c_gl, f_gl); -// // if (blockId * C + C == nc-1) { -// // if (c_gl + 1 == nc_c - 1) { -// // // T te = h4 * d + 2 * h4 * e; -// // // te += td * r3; -// // dw[get_idx(lddw1, lddw2, r_gl, blockId * C + actual_C, f_gl)] = -// // mass_trans(c, d, e, (T)0.0, (T)0.0, -// // h1, h2, (T)0.0, (T)0.0, r1, r2, (T)0.0, (T)0.0); -// // } -// // } -// } -// } - -// template -// void lpk_reo_2_adaptive_launcher(Handle &handle, SIZE *shape_h, -// SIZE *shape_c_h, SIZE *shape_d, SIZE -// *shape_c_d, SIZE *ldvs, SIZE *ldws, DIM -// processed_n, DIM *processed_dims_h, DIM -// *processed_dims_d, DIM curr_dim_r, DIM -// curr_dim_c, DIM curr_dim_f, T *ddist_c, T -// *dratio_c, T *dv1, LENGTH lddv11, LENGTH -// lddv12, T *dv2, LENGTH lddv21, LENGTH -// lddv22, T *dw, LENGTH lddw1, LENGTH lddw2, -// int queue_idx) { - -// SIZE nr = shape_h[curr_dim_r]; -// SIZE nc = shape_h[curr_dim_c]; -// SIZE nf = shape_h[curr_dim_f]; -// SIZE nc_c = shape_c_h[curr_dim_c]; -// SIZE nf_c = shape_c_h[curr_dim_f]; - -// SIZE total_thread_z = nr; -// SIZE total_thread_y = nc_c; -// // if (nc_c % 2 == 1) { total_thread_y = nc_c - 1; } -// // else { total_thread_y = nc_c; } -// SIZE total_thread_x = nf_c; -// SIZE tbx, tby, tbz, gridx, gridy, gridz; -// dim3 threadsPerBlock, blockPerGrid; -// size_t sm_size; - -// tbz = R; -// tby = C; -// tbx = F; -// sm_size = (R * (C * 2 + 3) * F + (C * 2 + 3) * 2) * sizeof(T); -// sm_size += (D * 4) * sizeof(SIZE); -// sm_size += (D * 1) * sizeof(DIM); -// gridz = ceil((float)total_thread_z / tbz); -// gridy = ceil((float)total_thread_y / tby); -// gridx = ceil((float)total_thread_x 
/ tbx); -// for (DIM d = 0; d < D; d++) { -// if (d != curr_dim_f && d != curr_dim_c && d != curr_dim_r) { -// SIZE t = shape_h[d]; -// for (DIM k = 0; k < processed_n; k++) { -// if (d == processed_dims_h[k]) { -// t = shape_c_h[d]; -// } -// } -// gridx *= t; -// } -// } -// threadsPerBlock = dim3(tbx, tby, tbz); -// blockPerGrid = dim3(gridx, gridy, gridz); -// // printf("lpk_reo_2 exec config (%d %d %d) (%d %d %d)\n", tbx, tby, tbz, -// // gridx, gridy, gridz); - -// _lpk_reo_2<<>>( -// shape_d, shape_c_d, ldvs, ldws, processed_n, processed_dims_d, -// curr_dim_r, curr_dim_c, curr_dim_f, ddist_c, dratio_c, dv1, lddv11, -// lddv12, dv2, lddv21, lddv22, dw, lddw1, lddw2); -// gpuErrchk(cudaGetLastError()); -// if (handle.sync_and_check_all_kernels) { -// gpuErrchk(cudaDeviceSynchronize()); -// } -// } - -// template -// void lpk_reo_2(Handle &handle, SIZE *shape_h, SIZE *shape_c_h, SIZE -// *shape_d, -// SIZE *shape_c_d, SIZE *ldvs, SIZE *ldws, DIM processed_n, -// DIM *processed_dims_h, DIM *processed_dims_d, DIM curr_dim_r, -// DIM curr_dim_c, DIM curr_dim_f, T *ddist_c, T *dratio_c, T -// *dv1, LENGTH lddv11, LENGTH lddv12, T *dv2, LENGTH lddv21, -// LENGTH lddv22, T *dw, LENGTH lddw1, LENGTH lddw2, int -// queue_idx, int config) { - -// #define LPK(R, C, F) \ -// { \ -// lpk_reo_2_adaptive_launcher( \ -// handle, shape_h, shape_c_h, shape_d, shape_c_d, ldvs, ldws, \ -// processed_n, processed_dims_h, processed_dims_d,\ -// curr_dim_r, \ -// curr_dim_c, curr_dim_f, ddist_c, dratio_c, dv1, lddv11, lddv12, dv2, \ -// lddv21, lddv22, dw, lddw1, lddw2, queue_idx); \ -// } - -// bool profile = false; -// if (handle.profile_kernels) { -// profile = true; -// } -// if (D >= 3) { -// if (profile || config == 6) { -// LPK(2, 2, 128) -// } -// if (profile || config == 5) { -// LPK(2, 2, 64) -// } -// if (profile || config == 4) { -// LPK(2, 2, 32) -// } -// if (profile || config == 3) { -// LPK(4, 4, 16) -// } -// if (profile || config == 2) { -// LPK(8, 8, 8) -// } -// if (profile || config == 1) { -// LPK(4, 4, 4) -// } -// if (profile || config == 0) { -// LPK(2, 2, 2) -// } -// } else if (D == 2) { -// if (profile || config == 6) { -// LPK(1, 2, 128) -// } -// if (profile || config == 5) { -// LPK(1, 2, 64) -// } -// if (profile || config == 4) { -// LPK(1, 2, 32) -// } -// if (profile || config == 3) { -// LPK(1, 4, 16) -// } -// if (profile || config == 2) { -// LPK(1, 8, 8) -// } -// if (profile || config == 1) { -// LPK(1, 4, 4) -// } -// if (profile || config == 0) { -// LPK(1, 2, 4) -// } -// } else { -// printf("Error: mass_trans_multiply_2_cpt is only for 3D and 2D data\n"); -// } -// #undef LPK -// } - -// template -// __global__ void -// _lpk_reo_3(SIZE *shape, SIZE *shape_c, SIZE *ldvs, SIZE *ldws, DIM -// processed_n, -// DIM *processed_dims, DIM curr_dim_r, DIM curr_dim_c, DIM -// curr_dim_f, T *ddist_r, T *dratio_r, T *dv1, LENGTH lddv11, LENGTH -// lddv12, T *dv2, LENGTH lddv21, LENGTH lddv22, T *dw, LENGTH lddw1, -// LENGTH lddw2) { - -// // bool debug = false; -// // if (blockIdx.z == gridDim.z-1 && blockIdx.y == 0 && blockIdx.x == 0 && -// // threadIdx.y == 0 && threadIdx.x == 0 ) debug = false; - -// // bool debug2 = false; -// // if (blockIdx.z == gridDim.z-1 && blockIdx.y == 1 && blockIdx.x == 16) -// // debug2 = false; - -// LENGTH threadId = (threadIdx.z * (blockDim.x * blockDim.y)) + -// (threadIdx.y * blockDim.x) + threadIdx.x; - -// T *sm = SharedMemory(); -// SIZE ldsm1 = F; -// SIZE ldsm2 = C; -// T *v_sm = sm; sm += ldsm1 * ldsm2 * (R * 2 + 3); - -// T 
*dist_r_sm = sm; sm += (R * 2 + 3); -// T *ratio_r_sm = sm; sm += (R * 2 + 3); - -// SIZE * sm_size = (SIZE*)sm; -// SIZE *shape_sm = sm_size; sm_size += D; -// SIZE *shape_c_sm = sm_size; sm_size += D; -// SIZE *ldvs_sm = sm_size; sm_size += D; -// SIZE *ldws_sm = sm_size; sm_size += D; -// sm = (T*)sm_size; - -// DIM * sm_dim = (DIM*)sm; -// DIM *processed_dims_sm = sm_dim; sm_dim += D; -// sm = (T*)sm_dim; - -// SIZE idx[D]; -// if (threadId < D) { -// shape_sm[threadId] = shape[threadId]; -// shape_c_sm[threadId] = shape_c[threadId]; -// ldvs_sm[threadId] = ldvs[threadId]; -// ldws_sm[threadId] = ldws[threadId]; -// } -// if (threadId < processed_n) { -// processed_dims_sm[threadId] = processed_dims[threadId]; -// } -// __syncthreads(); - -// for (DIM d = 0; d < D; d++) -// idx[d] = 0; - -// SIZE nr = shape_sm[curr_dim_r]; -// SIZE nf_c = shape_c_sm[curr_dim_f]; -// SIZE nc_c = shape_c_sm[curr_dim_c]; -// SIZE nr_c = shape_c_sm[curr_dim_r]; -// bool PADDING = (nr % 2 == 0); - -// SIZE bidx = blockIdx.x; -// SIZE firstD = div_roundup(nf_c, blockDim.x); -// SIZE blockId_f = bidx % firstD; -// bidx /= firstD; - -// for (DIM d = 0; d < D; d++) { -// if (d != curr_dim_r && d != curr_dim_c && d != curr_dim_f) { -// SIZE t = shape_sm[d]; -// for (DIM k = 0; k < processed_n; k++) { -// if (d == processed_dims[k]) { -// t = shape_c_sm[d]; -// } -// } -// idx[d] = bidx % t; -// bidx /= t; -// } -// } - -// LENGTH other_offset_v = get_idx(ldvs_sm, idx); -// LENGTH other_offset_w = get_idx(ldws_sm, idx); - -// dv1 = dv1 + other_offset_v; -// dv2 = dv2 + other_offset_v; -// dw = dw + other_offset_w; - -// // if (debug2) { -// // printf("idx: %d %d %d %d\n", idx[3], idx[2], idx[1], idx[0]); -// // printf("other_offset_v: %llu\n", other_offset_v); -// // printf("other_offset_w: %llu\n", other_offset_w); -// // } - -// SIZE r_gl = blockIdx.z * blockDim.z + threadIdx.z; -// SIZE c_gl = blockIdx.y * blockDim.y + threadIdx.y; -// SIZE f_gl = blockId_f * blockDim.x + threadIdx.x; - -// SIZE blockId = blockIdx.z; - -// SIZE r_sm = threadIdx.z; -// SIZE c_sm = threadIdx.y; -// SIZE f_sm = threadIdx.x; - -// SIZE actual_R = R; -// if (nr_c - blockIdx.z * blockDim.z < R) { -// actual_R = nr_c - blockIdx.z * blockDim.z; -// } -// // if (nr_c % 2 == 1){ -// // if(nr_c-1 - blockIdx.z * blockDim.z < R) { actual_R = nr_c - 1 - -// // blockIdx.z * blockDim.z; } -// // } else { -// // if(nr_c - blockIdx.z * blockDim.z < R) { actual_R = nr_c - blockIdx.z -// * -// // blockDim.z; } -// // } - -// // if (debug) printf("actual_R %d\n", actual_R); - -// // bool debug = false; -// // if (idx[3] == 0 && idx[2] == 0 && f_gl == 2 && c_gl == 1) debug = -// false; - -// // if (debug) printf("RCF: %d %d %d\n", R, C, F); -// if (r_gl < nr_c && c_gl < nc_c && f_gl < nf_c) { -// // if (debug) printf("load front vsm[%d]: %f <- %d %d %d\n", r_sm * 2 + -// 2, -// // dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)], r_gl, c_gl, f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 2, c_sm, f_sm)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)]; - -// if (r_sm == actual_R - 1) { -// if (r_gl + 1 < nr_c) { -// // if (debug) printf("load front+1 vsm[%d]: %f <- %d %d %d\n", -// actual_R -// // * 2 + 2, dv1[get_idx(lddv11, lddv12, blockId * R + actual_R, c_gl, -// // f_gl)], blockId * R + actual_R, c_gl, f_gl); -// v_sm[get_idx(ldsm1, ldsm2, actual_R * 2 + 2, c_sm, f_sm)] = -// dv1[get_idx(lddv11, lddv12, r_gl + 1, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load front+1 vsm[%d]: 0.0\n", actual_R * 2 + -// 2); v_sm[get_idx(ldsm1, 
ldsm2, actual_R * 2 + 2, c_sm, f_sm)] = 0.0; -// } -// } - -// if (r_sm == 0) { -// if (r_gl >= 1) { -// // if (debug) printf("load front-1 vsm[0]: %f <- %d %d %d\n", -// // dv1[get_idx(lddv11, lddv12, r_gl-1, c_gl, f_gl)], r_gl-1, c_gl, -// // f_gl); -// v_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = -// dv1[get_idx(lddv11, lddv12, r_gl - 1, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load front-1 vsm[0]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = 0.0; -// } -// } - -// if (!PADDING) { -// if (r_gl < nr_c - 1) { -// // if (debug) printf("load back vsm[%d]: 0.0\n", r_sm * 2 + 3); -// v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 3, c_sm, f_sm)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load back vsm[%d]: %f <- %d %d %d\n", r_sm * 2 -// + -// // 3, dv2[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)], r_gl, c_gl, -// f_gl); v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 3, c_sm, f_sm)] = 0.0; -// } -// } else { -// if (r_gl < nr_c - 2) { -// // if (debug) printf("load back vsm[%d]: %f <- %d %d %d\n", r_sm * 2 -// + -// // 3, dv2[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)], r_gl, c_gl, -// f_gl); v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 3, c_sm, f_sm)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load back vsm[%d]: 0.0\n", r_sm * 2 + 3); -// v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 3, c_sm, f_sm)] = 0.0; -// } -// } - -// if (r_gl >= 1 && -// (PADDING && r_gl - 1 < nr_c - 2 || !PADDING && r_gl - 1 < nr_c - 1)) -// { -// // if (blockId > 0) { -// if (r_sm == 0) { -// // if (debug) printf("load back-1 vsm[1]: %f <- %d %d %d\n", -// // dv2[get_idx(lddv11, lddv12, r_gl-1, c_gl, f_gl)], r_gl-1, c_gl, -// // f_gl); -// v_sm[get_idx(ldsm1, ldsm2, 1, c_sm, f_sm)] = -// dv2[get_idx(lddv11, lddv12, r_gl - 1, c_gl, f_gl)]; -// } -// } else { -// if (r_sm == 0) { -// // if (debug) printf("load back-1 vsm[1]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, 1, c_sm, f_sm)] = 0.0; -// } -// } -// } - -// // load dist/ratio using f_sm for better performance -// // assumption F >= R -// if (r_sm == 0 && c_sm == 0 && f_sm < actual_R) { -// if (blockId * R * 2 + f_sm < nr) { -// dist_r_sm[2 + f_sm] = ddist_r[blockId * R * 2 + f_sm]; -// // if (debug2 ) printf("load dist 1 [%d]: %f [%d]\n", 2 + f_sm, -// // dist_r_sm[2 + f_sm], blockId * R * 2 + f_sm); -// ratio_r_sm[2 + f_sm] = dratio_r[blockId * R * 2 + f_sm]; -// // if (debug2 )printf("load ratio 1 [%d]: %f [%d]\n", 2 + f_sm, -// // ratio_r_sm[2 + f_sm], blockId * R * 2 + f_sm); -// } else { -// dist_r_sm[2 + f_sm] = 0.0; -// ratio_r_sm[2 + f_sm] = 0.0; -// } -// if (blockId * R * 2 + actual_R + f_sm < nr) { -// dist_r_sm[2 + actual_R + f_sm] = -// ddist_r[blockId * R * 2 + actual_R + f_sm]; -// // if (debug2 )printf("load dist 2 [%d]: %f [%d]\n", 2 + actual_R + -// f_sm, -// // dist_r_sm[2 + actual_R + f_sm], blockId * R * 2 + actual_R + f_sm); -// ratio_r_sm[2 + actual_R + f_sm] = -// dratio_r[blockId * R * 2 + actual_R + f_sm]; -// // if (debug2 )printf("load ratio 2 [%d]: %f [%d]\n", 2 + actual_R + -// f_sm, -// // ratio_r_sm[2 + actual_R + f_sm], blockId * R * 2 + actual_R + f_sm); -// } else { -// dist_r_sm[2 + actual_R + f_sm] = 0.0; -// ratio_r_sm[2 + actual_R + f_sm] = 0.0; -// } -// } - -// if (blockId > 0) { -// if (f_sm < 2) { -// dist_r_sm[f_sm] = ddist_r[blockId * R * 2 - 2 + f_sm]; -// // if (debug2 )printf("load dist -1 [%d]: %f [%d]\n", f_sm, -// // dist_r_sm[f_sm], blockId * R * 2 - 2 + f_sm); -// ratio_r_sm[f_sm] = dratio_r[blockId * R * 2 - 2 + f_sm]; -// 
// if (debug2 )printf("load ratio -1 [%d]: %f [%d]\n", f_sm, -// // ratio_r_sm[f_sm], blockId * R * 2 - 2 + f_sm); -// } -// } else { -// if (f_sm < 2) { -// dist_r_sm[f_sm] = 0.0; -// ratio_r_sm[f_sm] = 0.0; -// } -// } - -// __syncthreads(); - -// int adjusted_nr_c = nr_c; -// if (r_gl < nr_c && c_gl < nc_c && f_gl < nf_c) { -// T h1 = dist_r_sm[r_sm * 2]; -// T h2 = dist_r_sm[r_sm * 2 + 1]; -// T h3 = dist_r_sm[r_sm * 2 + 2]; -// T h4 = dist_r_sm[r_sm * 2 + 3]; -// T r1 = ratio_r_sm[r_sm * 2]; -// T r2 = ratio_r_sm[r_sm * 2 + 1]; -// T r3 = ratio_r_sm[r_sm * 2 + 2]; -// T r4 = 1 - r3; -// T a = v_sm[get_idx(ldsm1, ldsm2, r_sm * 2, c_sm, f_sm)]; -// T b = v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 1, c_sm, f_sm)]; -// T c = v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 2, c_sm, f_sm)]; -// T d = v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 3, c_sm, f_sm)]; -// T e = v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 4, c_sm, f_sm)]; - -// // __syncthreads(); -// // if (debug) { -// // printf("r_sm(%d) %f %f %f %f %f\n",r_sm, a,b,c,d,e); -// // printf("r_sm_h(%d) %f %f %f %f\n",r_sm, h1,h2,h3,h4); -// // printf("r_sm_r(%d) %f %f %f %f\n",r_sm, r1,r2,r3,r4); -// // } -// // __syncthreads(); - -// // T tb = a * h1 + b * 2 * (h1+h2) + c * h2; -// // T tc = b * h2 + c * 2 * (h2+h3) + d * h3; -// // T td = c * h3 + d * 2 * (h3+h4) + e * h4; - -// // if (debug) printf("f_sm(%d) tb tc td tc: %f %f %f %f\n", f_sm, tb, tc, -// // td, tc+tb * r1 + td * r4); - -// // tc += tb * r1 + td * r4; - -// // if (debug) { -// // printf("f_sm(%d) %f %f %f %f %f f_sm_h %f %f %f %f f_sm_r %f %f %f -// %f, out: %f\n",f_sm, a,b,c,d,e, h1,h2,h3,h4,r1,r2,r3,r4, mass_trans(a, b, -// c, d, e, h1, h2, h3, h4, r1, r2, r3, r4)); -// // } - -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = -// mass_trans(a, b, c, d, e, h1, h2, h3, h4, r1, r2, r3, r4); - -// // if (debug) printf("store[%d %d %d] %f (%f)\n", r_gl, c_gl, f_gl, -// // mass_trans(a, b, c, d, e, h1, h2, h3, h4, r1, r2, r3, r4), -// // mass_trans(a, b, c, (T)0.0, (T)0.0, h1, (T)0.0, -// (T)0.0, -// // h4, r1, r2, (T)0.0, (T)0.0)); -// // // printf("%d %d %d\n", r_gl, c_gl, f_gl); -// // if (blockId * R + R == nr-1) { -// // if (r_gl+1 == nr_c - 1) { -// // if (r_gl+1 == nr_c - 1) { -// // // T te = h4 * d + 2 * h4 * e; -// // // te += td * r3; -// // dw[get_idx(lddw1, lddw2, blockId * R + actual_R, c_gl, f_gl)] = -// // mass_trans(c, d, e, (T)0.0, (T)0.0, -// // h1, h2, (T)0.0, (T)0.0, r1, r2, (T)0.0, (T)0.0); - -// // if (debug) printf("store-last[%d %d %d] %f\n", blockId * R + -// actual_R, -// // c_gl, f_gl, -// // mass_trans(c, d, e, (T)0.0, (T)0.0, -// // h1, h2, (T)0.0, (T)0.0, r1, r2, (T)0.0, (T)0.0)); -// // } -// //} -// } -// } - -// template -// void lpk_reo_3_adaptive_launcher(Handle &handle, SIZE *shape_h, -// SIZE *shape_c_h, SIZE *shape_d, SIZE -// *shape_c_d, SIZE *ldvs, SIZE *ldws, DIM -// processed_n, DIM *processed_dims_h, DIM -// *processed_dims_d, DIM curr_dim_r, DIM -// curr_dim_c, DIM curr_dim_f, T *ddist_r, T -// *dratio_r, T *dv1, LENGTH lddv11, LENGTH -// lddv12, T *dv2, LENGTH lddv21, LENGTH -// lddv22, T *dw, LENGTH lddw1, LENGTH lddw2, -// int queue_idx) { - -// SIZE nr = shape_h[curr_dim_r]; -// SIZE nc = shape_h[curr_dim_c]; -// SIZE nf = shape_h[curr_dim_f]; -// SIZE nr_c = shape_c_h[curr_dim_r]; -// SIZE nc_c = shape_c_h[curr_dim_c]; -// SIZE nf_c = shape_c_h[curr_dim_f]; - -// SIZE total_thread_z = nr_c; -// // if (nr_c % 2 == 1){ total_thread_z = nr_c - 1; } -// // else { total_thread_z = nr_c; } -// SIZE total_thread_y = nc_c; -// SIZE total_thread_x = nf_c; 
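Every store in these kernels boils down to `dw[...] = mass_trans(a, b, c, d, e, h1..h4, r1..r4)`: one row of a 1-D finite-element mass matrix applied to five consecutive values, with the two neighboring rows folded in through the ratio weights. A sketch reconstructed from the commented-out reference formulas (`tb`/`tc`/`td`) above; the real `mass_trans` also receives r2 and r3, and its exact scaling may differ:

```cuda
// Reconstruction of the commented-out reference math behind mass_trans:
// three overlapping mass-matrix rows over the 5-point stencil {a,b,c,d,e},
// with the outer rows folded into the center row by the restriction
// ratios r1 and r4 (the kernels set r4 = 1 - r3).
template <typename T>
__host__ __device__ T mass_trans_sketch(T a, T b, T c, T d, T e,
                                        T h1, T h2, T h3, T h4,
                                        T r1, T r4) {
  T tb = a * h1 / 6 + b * (h1 + h2) / 3 + c * h2 / 6; // row centered at b
  T tc = b * h2 / 6 + c * (h2 + h3) / 3 + d * h3 / 6; // row centered at c
  T td = c * h3 / 6 + d * (h3 + h4) / 3 + e * h4 / 6; // row centered at d
  return tc + tb * r1 + td * r4;                      // fold neighbors in
}
```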
- -// SIZE tbx, tby, tbz, gridx, gridy, gridz; -// dim3 threadsPerBlock, blockPerGrid; -// size_t sm_size; - -// tbz = R; -// tby = C; -// tbx = F; -// sm_size = ((R * 2 + 3) * C * F + (R * 2 + 3) * 2) * sizeof(T); -// sm_size += (D * 4) * sizeof(SIZE); -// sm_size += (D * 1) * sizeof(DIM); -// gridz = ceil((float)total_thread_z / tbz); -// gridy = ceil((float)total_thread_y / tby); -// gridx = ceil((float)total_thread_x / tbx); -// for (DIM d = 0; d < D; d++) { -// if (d != curr_dim_f && d != curr_dim_c && d != curr_dim_r) { -// SIZE t = shape_h[d]; -// for (DIM k = 0; k < processed_n; k++) { -// if (d == processed_dims_h[k]) { -// t = shape_c_h[d]; -// } -// } -// gridx *= t; -// } -// } -// threadsPerBlock = dim3(tbx, tby, tbz); -// blockPerGrid = dim3(gridx, gridy, gridz); -// // printf("lpk_reo_3 exec config (%d %d %d) (%d %d %d)\n", tbx, tby, tbz, -// // gridx, gridy, gridz); - -// _lpk_reo_3<<>>( -// shape_d, shape_c_d, ldvs, ldws, processed_n, processed_dims_d, -// curr_dim_r, curr_dim_c, curr_dim_f, ddist_r, dratio_r, dv1, lddv11, -// lddv12, dv2, lddv21, lddv22, dw, lddw1, lddw2); -// gpuErrchk(cudaGetLastError()); -// if (handle.sync_and_check_all_kernels) { -// gpuErrchk(cudaDeviceSynchronize()); -// } -// } - -// template -// void lpk_reo_3(Handle &handle, SIZE *shape_h, SIZE *shape_c_h, SIZE -// *shape_d, -// SIZE *shape_c_d, SIZE *ldvs, SIZE *ldws, DIM processed_n, -// DIM *processed_dims_h, DIM *processed_dims_d, DIM curr_dim_r, -// DIM curr_dim_c, DIM curr_dim_f, T *ddist_r, T *dratio_r, T -// *dv1, LENGTH lddv11, LENGTH lddv12, T *dv2, LENGTH lddv21, -// LENGTH lddv22, T *dw, LENGTH lddw1, LENGTH lddw2, int -// queue_idx, int config) { - -// #define LPK(R, C, F) \ -// { \ -// lpk_reo_3_adaptive_launcher( \ -// handle, shape_h, shape_c_h, shape_d, shape_c_d, ldvs, ldws, \ -// processed_n, processed_dims_h, processed_dims_d,\ -// curr_dim_r, \ -// curr_dim_c, curr_dim_f, ddist_r, dratio_r, dv1, lddv11, lddv12, dv2, \ -// lddv21, lddv22, dw, lddw1, lddw2, queue_idx); \ -// } -// bool profile = false; -// if (handle.profile_kernels) { -// profile = true; -// } -// if (D >= 3) { -// if (profile || config == 6) { -// LPK(2, 2, 128) -// } -// if (profile || config == 5) { -// LPK(2, 2, 64) -// } -// if (profile || config == 4) { -// LPK(2, 2, 32) -// } -// if (profile || config == 3) { -// LPK(4, 4, 16) -// } -// if (profile || config == 2) { -// LPK(8, 8, 8) -// } -// if (profile || config == 1) { -// LPK(4, 4, 4) -// } -// if (profile || config == 0) { -// LPK(2, 2, 2) -// } -// } else { -// printf("Error: mass_trans_multiply_3_cpt is only for 3D data\n"); -// } - -// #undef LPK -// } - } // namespace mgard_x #endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel3D.h b/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel3D.h deleted file mode 100644 index 922eecbaf1..0000000000 --- a/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel3D.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#ifndef MGARD_X_LINEAR_PROCESSING_KERNEL_3D -#define MGARD_X_LINEAR_PROCESSING_KERNEL_3D - -#include "../../Common.h" - -namespace mgard_x { - -template -void lpk_reo_1_3d(Handle &handle, SIZE nr, SIZE nc, SIZE nf, SIZE nf_c, - SIZE zero_r, SIZE zero_c, SIZE zero_f, T *ddist_f, - T *dratio_f, T *dv1, SIZE lddv11, SIZE lddv12, T *dv2, - SIZE lddv21, SIZE lddv22, T *dw, SIZE lddw1, SIZE lddw2, - int queue_idx, int config); - -template -void lpk_reo_2_3d(Handle &handle, SIZE nr, SIZE nc, SIZE nf_c, SIZE nc_c, - T *ddist_c, T *dratio_c, T *dv1, SIZE lddv11, SIZE lddv12, - T *dv2, SIZE lddv21, SIZE lddv22, T *dw, SIZE lddw1, - SIZE lddw2, int queue_idx, int config); - -template -void lpk_reo_3_3d(Handle &handle, SIZE nr, SIZE nc_c, SIZE nf_c, - SIZE nr_c, T *ddist_r, T *dratio_r, T *dv1, SIZE lddv11, - SIZE lddv12, T *dv2, SIZE lddv21, SIZE lddv22, T *dw, - SIZE lddw1, SIZE lddw2, int queue_idx, int config); - -} // namespace mgard_x - -#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel3D.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel3D.hpp index 0b043fb729..e757c85457 100644 --- a/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel3D.hpp +++ b/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel3D.hpp @@ -435,14 +435,11 @@ class Lpk1Reo3D : public AutoTuner { SubArray dv1, SubArray dv2, SubArray dw, int queue_idx) { int range_l = std::min(6, (int)std::log2(nf) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_mr1[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.lpk1_3d[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define LPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -455,22 +452,26 @@ class Lpk1Reo3D : public AutoTuner { GenTask(nr, nc, nf, nf_c, zero_r, zero_c, zero_f, ddist_f, \ dratio_f, dv1, dv2, dw, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - LPK(0) - LPK(1) - LPK(2) - LPK(3) - LPK(4) - LPK(5) - LPK(6) + LPK(6) if (!ret.success) config--; + LPK(5) if (!ret.success) config--; + LPK(4) if (!ret.success) config--; + LPK(3) if (!ret.success) config--; + LPK(2) if (!ret.success) config--; + LPK(1) if (!ret.success) config--; + LPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for Lpk1Reo3D.\n"; + exit(-1); + } #undef LPK if (AutoTuner::ProfileKernels) { @@ -787,14 +788,11 @@ class Lpk2Reo3D : public AutoTuner { SubArray dv1, SubArray dv2, SubArray dw, int queue_idx) { int range_l = std::min(6, (int)std::log2(nf_c) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_mr2[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.lpk2_3d[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + 
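The retuned Lpk1Reo3D dispatch above (and its Lpk2Reo3D/Lpk3Reo3D twins below) replaces the fixed `LPK(0..6)` sequence with a fallback cascade: probe from the largest thread-block shape (config 6) downward, step `config` down whenever `Execute` reports failure, and error out only when config 0 has failed too; under profiling, the fastest successful configuration is kept. A plain-C++ sketch of that loop, with a hypothetical `execute` callable standing in for the `LPK(CONFIG)` macro body:

```cpp
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <limits>

struct ExecutionReturnSketch {
  bool success;          // false when a config cannot run (e.g. resources)
  double execution_time; // filled in when kernels are profiled
};

// Sketch of the fallback logic the diff adds around LPK(CONFIG): try the
// requested config, fall through to smaller thread-block shapes on failure,
// and abort only when config 0 has failed as well.
int run_with_fallback(std::function<ExecutionReturnSketch(int)> execute,
                      int config, bool profile) {
  double min_time = std::numeric_limits<double>::max();
  int min_config = 0;
  ExecutionReturnSketch ret{false, 0.0};
  for (int c = 6; c >= 0; c--) {
    if (config == c || profile) {
      ret = execute(c);
      if (profile && ret.success && ret.execution_time < min_time) {
        min_time = ret.execution_time; // remember the fastest success
        min_config = c;
      }
      if (!ret.success)
        config--; // fall through to the next smaller configuration
    }
  }
  if (config < 0 && !ret.success) {
    std::fprintf(stderr, "no suitable config\n");
    std::exit(-1);
  }
  return profile ? min_config : config;
}
```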
ExecutionReturn ret; #define LPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -806,22 +804,26 @@ class Lpk2Reo3D : public AutoTuner { TaskType task = GenTask(nr, nc, nf_c, nc_c, ddist_c, dratio_c, \ dv1, dv2, dw, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - LPK(0) - LPK(1) - LPK(2) - LPK(3) - LPK(4) - LPK(5) - LPK(6) + LPK(6) if (!ret.success) config--; + LPK(5) if (!ret.success) config--; + LPK(4) if (!ret.success) config--; + LPK(3) if (!ret.success) config--; + LPK(2) if (!ret.success) config--; + LPK(1) if (!ret.success) config--; + LPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for Lpk2Reo3D.\n"; + exit(-1); + } #undef LPK if (AutoTuner::ProfileKernels) { @@ -1156,14 +1158,11 @@ class Lpk3Reo3D : public AutoTuner { SubArray dv1, SubArray dv2, SubArray dw, int queue_idx) { int range_l = std::min(6, (int)std::log2(nf_c) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_mr3[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.lpk3_3d[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define LPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -1175,22 +1174,26 @@ class Lpk3Reo3D : public AutoTuner { TaskType task = GenTask(nr, nc_c, nf_c, nr_c, ddist_r, dratio_r, \ dv1, dv2, dw, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - LPK(0) - LPK(1) - LPK(2) - LPK(3) - LPK(4) - LPK(5) - LPK(6) + LPK(6) if (!ret.success) config--; + LPK(5) if (!ret.success) config--; + LPK(4) if (!ret.success) config--; + LPK(3) if (!ret.success) config--; + LPK(2) if (!ret.success) config--; + LPK(1) if (!ret.success) config--; + LPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for Lpk3Reo3D.\n"; + exit(-1); + } #undef LPK if (AutoTuner::ProfileKernels) { @@ -1199,1130 +1202,6 @@ class Lpk3Reo3D : public AutoTuner { } }; -// template -// __global__ void _lpk_reo_1_3d(SIZE nr, SIZE nc, SIZE nf, SIZE nf_c, SIZE -// zero_r, -// SIZE zero_c, SIZE zero_f, T *ddist_f, T -// *dratio_f, T *dv1, SIZE lddv11, SIZE lddv12, T -// *dv2, SIZE lddv21, SIZE lddv22, T *dw, SIZE -// lddw1, SIZE lddw2) { - -// // bool debug = false; -// // if (blockIdx.z == 0 && blockIdx.y == 0 && blockIdx.x == 1 && -// // threadIdx.y == 0 && threadIdx.z == 0 ) debug = false; - -// // bool debug2 = false; -// // if (blockIdx.z == gridDim.z-1 && blockIdx.y == 1 && blockIdx.x == 16) -// // debug2 = false; - -// bool PADDING = (nf % 2 == 0); - -// T *sm = SharedMemory(); -// // extern __shared__ double sm[]; // size: (blockDim.x + 1) * (blockDim.y + -// 1) -// // * (blockDim.z + 1) -// SIZE ldsm1 = F * 2 + 3; -// SIZE ldsm2 = C; -// T *v_sm = sm; -// T *dist_f_sm = sm + ldsm1 * ldsm2 * R; -// T *ratio_f_sm = dist_f_sm + ldsm1; - -// // bool debug = false; -// // 
if (blockIdx.z == 0 && blockIdx.y == 0 && blockIdx.x == 0 && -// // threadIdx.z == 0 && threadIdx.y == 0 ) debug = true; - -// SIZE r_gl = blockIdx.z * blockDim.z + threadIdx.z; -// SIZE c_gl = blockIdx.y * blockDim.y + threadIdx.y; -// SIZE f_gl = blockIdx.x * blockDim.x + threadIdx.x; - -// SIZE blockId = blockIdx.x; - -// SIZE r_sm = threadIdx.z; -// SIZE c_sm = threadIdx.y; -// SIZE f_sm = threadIdx.x; - -// SIZE actual_F = F; -// if (nf_c - blockId * blockDim.x < F) { -// actual_F = nf_c - blockId * blockDim.x; -// } - -// // if (nf_c % 2 == 1){ -// // if(nf_c-1 - blockId * blockDim.x < F) { actual_F = nf_c - 1 - blockId -// * -// // blockDim.x; } -// // } else { -// // if(nf_c - blockId * blockDim.x < F) { actual_F = nf_c - blockId * -// // blockDim.x; } -// // } - -// // if (debug) printf("actual_F %d\n", actual_F); - -// if (r_gl < nr && c_gl < nc && f_gl < nf_c) { -// if (r_gl < zero_r && c_gl < zero_c && f_gl < zero_f) { -// // if (debug) printf("load left vsm[%d]: 0.0\n", f_sm * 2 + 2); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 2)] = 0.0; -// } else { -// // if (debug) printf("load left vsm[%d]<-dv1[%d, %d, %d]: %f\n", f_sm * -// 2 -// // + 2, r_gl, c_gl, f_gl, dv1[get_idx(lddv11, lddv12, r_gl, c_gl, -// f_gl)]); v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 2)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)]; -// } - -// if (f_sm == actual_F - 1) { -// if (r_gl < zero_r && c_gl < zero_c && f_gl < zero_f) { -// // if (debug) printf("load left+1 vsm[%d]: 0.0\n", actual_F * 2 + 2); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 2)] = 0.0; -// } else { -// if (f_gl + 1 < nf_c) { -// // if (debug) printf("load left+1 vsm[%d]: %f\n", actual_F * 2 + 2, -// // dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl + 1)]); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 2)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl + 1)]; -// } else { -// // if (debug) printf("load left+1 vsm[%d]: 0.0\n", actual_F * 2 + -// 2); v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 2)] = -// 0.0; -// } -// } -// } - -// if (f_sm == 0) { -// // left -// if (r_gl < zero_r && c_gl < zero_c && f_gl < zero_f) { -// // coarse (-1) -// // if (debug) printf("load left-1 vsm[0]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = 0.0; -// } else { -// if (f_gl >= 1) { -// // other (-1) -// // if (debug) printf("load left-1 vsm[0]: %f\n", -// dv1[get_idx(lddv11, -// // lddv12, r_gl, c_gl, f_gl-1)]); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl - 1)]; -// } else { -// // other (-1) -// // if (debug) printf("load left-1 vsm[0]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = 0.0; -// } -// } -// } - -// // right -// if (!PADDING) { -// if (nf_c % 2 != 0) { -// if (f_gl >= 1 && f_gl < nf_c ) { -// // if (debug) printf("load right vsm[%d]: %f <- %d %d %d\n", f_sm * -// 2 -// // + 1, dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - 1)], r_gl, -// // c_gl, f_gl - 1); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 1)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - 1)]; -// } else { -// // if (debug) printf("load right vsm[%d]: 0\n", f_sm * 2 + 1); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 1)] = 0.0; -// } -// } else { // nf_c % 2 == 0 -// if (f_gl < nf_c - 1) { -// // if (debug) printf("load right vsm[%d]: %f <- %d %d %d\n", f_sm * -// 2 -// // + 3, dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)], r_gl, c_gl, -// // f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 3)] = -// 
dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load right vsm[%d]: 0\n", f_sm * 2 + 3); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 3)] = 0.0; -// } -// } -// } else { // PADDING -// if (nf_c % 2 != 0) { -// if (f_gl >= 1 && f_gl < nf_c - 1) { -// // if (debug) printf("load right vsm[%d]: %f <- %d %d %d\n", f_sm * -// 2 -// // + 1, dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - 1)], r_gl, -// // c_gl, f_gl - 1); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 1)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - 1)]; -// } else { -// // if (debug) printf("load right vsm[%d]: 0\n", f_sm * 2 + 1); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 1)] = 0.0; -// } -// } else { // nf_c % 2 == 0 -// if (f_gl < nf_c - 2) { -// // if (debug) printf("load right vsm[%d]: %f <- %d %d %d\n", f_sm * -// 2 -// // + 3, dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)], r_gl, c_gl, -// // f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 3)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load right vsm[%d]: 0\n", f_sm * 2 + 3); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 3)] = 0.0; -// } -// } -// } - -// if (f_sm == actual_F - 1) { -// // right (+1) -// if (!PADDING) { -// if (nf_c % 2 != 0) { -// if (f_gl < nf_c - 1) { -// // if (debug) printf("load right+1 vsm[%d]: %f <- %d %d %d\n", -// // actual_F * 2 + 1, dv2[get_idx(lddv21, lddv22, r_gl, c_gl, -// f_gl)], -// // r_gl, c_gl, f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 1)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load right+1 vsm[%d]: 0.0\n", actual_F * 2 -// + -// // 1); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 1)] = 0.0; -// } -// } else { // nf_c % 2 == 0 -// if (f_gl >= actual_F) { -// // if (debug) printf("load right-1 vsm[1]: %f <- %d %d %d\n", -// // dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - actual_F)], -// r_gl, -// // c_gl, f_gl - actual_F); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 1)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - actual_F)]; -// } else { -// // if (debug) printf("load right-1 vsm[1]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 1)] = 0.0; -// } -// } -// } else { -// if (nf_c % 2 != 0) { -// if (f_gl < nf_c - 2) { -// // if (debug) printf("actual_F(%d), load right+1 vsm[%d]: %f <- -// %d -// // %d %d\n", actual_F, actual_F * 2 + 1, dv2[get_idx(lddv21, -// lddv22, -// // r_gl, c_gl, f_gl)], r_gl, c_gl, f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 1)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load right+1 vsm[%d]: 0.0\n", actual_F * 2 -// + -// // 1); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 1)] = 0.0; -// } -// } else { // nf_c % 2 == 0 -// if (f_gl >= actual_F && f_gl - actual_F < nf_c - 2) { -// // if (debug) printf("load right-1 vsm[1]: %f <- %d %d %d\n", -// // dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - actual_F)], -// r_gl, -// // c_gl, f_gl - actual_F); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 1)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - actual_F)]; -// } else { -// // if (debug) printf("load right-1 vsm[1]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 1)] = 0.0; -// } -// } -// } -// } -// } - -// bool debug = false; -// // if (r_gl == 0 && c_gl == 0) debug = true; - -// if (r_sm == 0 && c_sm == 0 && f_sm < actual_F) { -// if (blockId * F * 2 + f_sm < nf) { -// dist_f_sm[2 
+ f_sm] = ddist_f[blockId * F * 2 + f_sm]; -// ratio_f_sm[2 + f_sm] = dratio_f[blockId * F * 2 + f_sm]; -// if (debug) printf("load dist[%d] -> sm[%d]: %f\n", blockId * F * 2 + -// f_sm, 2 + f_sm, ddist_f[blockId * F * 2 + f_sm]); -// } else { -// dist_f_sm[2 + f_sm] = 0.0; -// ratio_f_sm[2 + f_sm] = 0.0; -// } - -// if (blockId * F * 2 + actual_F + f_sm < nf) { -// dist_f_sm[2 + actual_F + f_sm] = -// ddist_f[blockId * F * 2 + actual_F + f_sm]; -// ratio_f_sm[2 + actual_F + f_sm] = -// dratio_f[blockId * F * 2 + actual_F + f_sm]; -// if (debug) printf("load dist[%d] -> sm[%d]: %f\n", blockId * F * 2 + -// actual_F + f_sm, 2 + actual_F + f_sm, ddist_f[blockId * F * 2 + -// actual_F + f_sm]); -// } else { -// dist_f_sm[2 + actual_F + f_sm] = 0.0; -// ratio_f_sm[2 + actual_F + f_sm] = 0.0; -// } -// // dist_f_sm[2 + f_sm] = ddist_f[f_gl]; -// // dist_f_sm[2 + actual_F + f_sm] = ddist_f[actual_F + f_gl]; -// // ratio_f_sm[2 + f_sm] = dratio_f[f_gl]; -// // ratio_f_sm[2 + actual_F + f_sm] = dratio_f[actual_F + f_gl]; -// } - -// if (blockId > 0) { -// if (f_sm < 2) { -// // dist_f_sm[f_sm] = ddist_f[f_gl - 2]; -// // ratio_f_sm[f_sm] = dratio_f[f_gl - 2]; -// dist_f_sm[f_sm] = ddist_f[blockId * F * 2 + f_sm - 2]; -// ratio_f_sm[f_sm] = dratio_f[blockId * F * 2 + f_sm - 2]; -// } -// } else { -// if (f_sm < 2) { -// dist_f_sm[f_sm] = 0.0; -// ratio_f_sm[f_sm] = 0.0; -// } -// } - -// __syncthreads(); - -// if (r_gl < nr && c_gl < nc && f_gl < nf_c) { -// T h1 = dist_f_sm[f_sm * 2]; -// T h2 = dist_f_sm[f_sm * 2 + 1]; -// T h3 = dist_f_sm[f_sm * 2 + 2]; -// T h4 = dist_f_sm[f_sm * 2 + 3]; -// T r1 = ratio_f_sm[f_sm * 2]; -// T r2 = ratio_f_sm[f_sm * 2 + 1]; -// T r3 = ratio_f_sm[f_sm * 2 + 2]; -// T r4 = 1 - r3; -// T a = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2)]; -// T b = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 1)]; -// T c = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 2)]; -// T d = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 3)]; -// T e = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 4)]; - -// // if (f_gl == nf_c - 1) { -// // printf("f_sm(%d) %f %f %f %f %f\n",f_sm, a,b,c,d,e); -// // printf("f_sm_h(%d) %f %f %f %f\n",f_sm, h1,h2,h3,h4); -// // printf("f_sm_r(%d) %f %f %f %f\n",f_sm, r1,r2,r3,r4); -// // } - -// // T tb = a * h1 + b * 2 * (h1+h2) + c * h2; -// // T tc = b * h2 + c * 2 * (h2+h3) + d * h3; -// // T td = c * h3 + d * 2 * (h3+h4) + e * h4; - -// // if (debug) printf("f_sm(%d) tb tc td tc: %f %f %f %f\n", f_sm, tb, tc, -// // td, tc+tb * r1 + td * r4); - -// // tc += tb * r1 + td * r4; - -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = -// mass_trans(a, b, c, d, e, h1, h2, h3, h4, r1, r2, r3, r4); - -// // if (debug) printf("store[%d %d %d] %f \n", r_gl, c_gl, f_gl, -// // mass_trans(a, b, c, d, e, h1, h2, h3, h4, r1, r2, r3, r4)); - -// // printf("test block %d F %d nf %d\n", blockId, F, nf); -// // if (f_gl+1 == nf_c-1) { - -// // // T te = h4 * d + 2 * h4 * e; -// // //printf("f_sm(%d) mm-e: %f\n", f_sm, te); -// // // te += td * r3; -// // dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl+1)] = -// // mass_trans(c, d, e, (T)0.0, (T)0.0, h1, h2, (T)0.0, (T)0.0, r1, -// r2, -// // (T)0.0, (T)0.0); -// // } -// } -// } - -// template -// void lpk_reo_1_3d_adaptive_launcher(Handle &handle, SIZE nr, SIZE nc, -// SIZE nf, SIZE nf_c, SIZE zero_r, SIZE -// zero_c, SIZE zero_f, T *ddist_f, T -// *dratio_f, T *dv1, SIZE lddv11, SIZE -// lddv12, T *dv2, SIZE lddv21, SIZE lddv22, -// T *dw, SIZE lddw1, SIZE lddw2, int -// queue_idx) { -// // 
printf("dratio_f: "); -// // print_matrix_cuda(1, (nf-1)*2, dratio_f, (nf-1)*2); -// SIZE total_thread_z = nr; -// SIZE total_thread_y = nc; -// SIZE total_thread_x = nf_c; -// // if (nf_c % 2 == 1) { total_thread_x = nf_c - 1; } -// // else { total_thread_x = nf; } -// SIZE tbx, tby, tbz, gridx, gridy, gridz; -// dim3 threadsPerBlock, blockPerGrid; -// size_t sm_size; - -// tbz = R; -// tby = C; -// tbx = F; -// sm_size = (R * C * (F * 2 + 3) + (F * 2 + 3) * 2) * sizeof(T); -// gridz = ceil((float)total_thread_z / tbz); -// gridy = ceil((float)total_thread_y / tby); -// gridx = ceil((float)total_thread_x / tbx); -// threadsPerBlock = dim3(tbx, tby, tbz); -// blockPerGrid = dim3(gridx, gridy, gridz); - -// // printf("nr: %d nc: %d, nf: %d, nf_c: %d\n", nr, nc, nf, nf_c); -// // printf("tb: %d %d %d, grid: %d %d %d\n", tbx, tby, tbz, gridx, gridy, -// // gridz); - -// _lpk_reo_1_3d<<>>( -// nr, nc, nf, nf_c, zero_r, zero_c, zero_f, ddist_f, dratio_f, dv1, -// lddv11, lddv12, dv2, lddv21, lddv22, dw, lddw1, lddw2); -// gpuErrchk(cudaGetLastError()); -// if (handle.sync_and_check_all_kernels) { -// gpuErrchk(cudaDeviceSynchronize()); -// } -// } - -// template -// void lpk_reo_1_3d(Handle &handle, SIZE nr, SIZE nc, SIZE nf, SIZE nf_c, -// SIZE zero_r, SIZE zero_c, SIZE zero_f, T *ddist_f, T -// *dratio_f, T *dv1, SIZE lddv11, SIZE lddv12, T *dv2, SIZE -// lddv21, SIZE lddv22, T *dw, SIZE lddw1, SIZE lddw2, int -// queue_idx, int config) { - -// #define LPK(R, C, F) \ -// { \ -// lpk_reo_1_3d_adaptive_launcher( \ -// handle, nr, nc, nf, nf_c, zero_r, zero_c, zero_f, ddist_f, dratio_f, \ -// dv1, lddv11, lddv12, dv2, lddv21, lddv22, dw, lddw1, lddw2, \ -// queue_idx); \ -// } - -// bool profile = false; -// if (handle.profile_kernels) { -// profile = true; -// } -// if (D == 3) { -// if (profile || config == 6) { -// LPK(2, 2, 128) -// } -// if (profile || config == 5) { -// LPK(2, 2, 64) -// } -// if (profile || config == 4) { -// LPK(2, 2, 32) -// } -// if (profile || config == 3) { -// LPK(4, 4, 16) -// } -// if (profile || config == 2) { -// LPK(8, 8, 8) -// } -// if (profile || config == 1) { -// LPK(4, 4, 4) -// } -// if (profile || config == 0) { -// LPK(2, 2, 2) -// } -// } else if (D == 2) { -// if (profile || config == 6) { -// LPK(1, 2, 128) -// } -// if (profile || config == 5) { -// LPK(1, 2, 64) -// } -// if (profile || config == 4) { -// LPK(1, 2, 32) -// } -// if (profile || config == 3) { -// LPK(1, 4, 16) -// } -// if (profile || config == 2) { -// LPK(1, 8, 8) -// } -// if (profile || config == 1) { -// LPK(1, 4, 4) -// } -// if (profile || config == 0) { -// LPK(1, 2, 4) -// } -// } else if (D == 1) { -// if (profile || config == 6) { -// LPK(1, 1, 128) -// } -// if (profile || config == 5) { -// LPK(1, 1, 64) -// } -// if (profile || config == 4) { -// LPK(1, 1, 32) -// } -// if (profile || config == 3) { -// LPK(1, 1, 16) -// } -// if (profile || config == 2) { -// LPK(1, 1, 8) -// } -// if (profile || config == 1) { -// LPK(1, 1, 8) -// } -// if (profile || config == 0) { -// LPK(1, 1, 8) -// } -// } -// #undef LPK -// } - -// template -// __global__ void _lpk_reo_2_3d(SIZE nr, SIZE nc, SIZE nf_c, SIZE nc_c, T -// *ddist_c, -// T *dratio_c, T *dv1, SIZE lddv11, SIZE lddv12, -// T *dv2, SIZE lddv21, SIZE lddv22, T *dw, SIZE -// lddw1, SIZE lddw2) { - -// // bool debug = false; -// // if (blockIdx.y == gridDim.y-1 && blockIdx.x == 0 && -// // threadIdx.x == 0 ) debug = false; - -// // bool debug2 = false; -// // if (blockIdx.z == gridDim.z-1 && blockIdx.y == 1 && blockIdx.x 
== 16) -// // debug2 = false; - -// bool PADDING = (nc % 2 == 0); - -// T *sm = SharedMemory(); - -// // extern __shared__ double sm[]; // size: (blockDim.x + 1) * (blockDim.y + -// 1) -// // * (blockDim.z + 1) -// SIZE ldsm1 = F; -// SIZE ldsm2 = C * 2 + 3; -// T *v_sm = sm; -// T *dist_c_sm = sm + ldsm1 * ldsm2 * R; -// T *ratio_c_sm = dist_c_sm + ldsm2; - -// // bool debug = false; -// // if (blockIdx.z == 0 && blockIdx.y == 0 && blockIdx.x == 0 && -// // threadIdx.z == 0 && threadIdx.x == 0 ) debug = false; - -// SIZE r_gl = blockIdx.z * blockDim.z + threadIdx.z; -// SIZE c_gl = blockIdx.y * blockDim.y + threadIdx.y; -// SIZE f_gl = blockIdx.x * blockDim.x + threadIdx.x; - -// SIZE blockId = blockIdx.y; - -// SIZE r_sm = threadIdx.z; -// SIZE c_sm = threadIdx.y; -// SIZE f_sm = threadIdx.x; - -// SIZE actual_C = C; -// if (nc_c - blockIdx.y * blockDim.y < C) { -// actual_C = nc_c - blockIdx.y * blockDim.y; -// } - -// // if (nc_c % 2 == 1){ -// // if(nc_c-1 - blockIdx.y * blockDim.y < C) { actual_C = nc_c - 1 - -// // blockIdx.y * blockDim.y; } -// // } else { -// // if(nc_c - blockIdx.y * blockDim.y < C) { actual_C = nc_c - blockIdx.y -// * -// // blockDim.y; } -// // } - -// // if (debug) printf("actual_C %d\n", actual_C); - -// if (r_gl < nr && c_gl < nc_c && f_gl < nf_c) { -// // if (debug) printf("load up vsm[%d]: %f <- %d %d %d\n", c_sm * 2 + 2, -// // dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)], r_gl, c_gl, f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 2, f_sm)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)]; - -// if (c_sm == actual_C - 1) { -// if (c_gl + 1 < nc_c) { -// // if (debug) printf("load up+1 vsm[%d]: %f <- %d %d %d\n", actual_C -// * 2 -// // + 2, dv1[get_idx(lddv11, lddv12, r_gl, blockId * C + actual_C, -// // f_gl)], r_gl, blockId * C + actual_C, f_gl); -// // c_gl+1 == blockId * C + C -// v_sm[get_idx(ldsm1, ldsm2, r_sm, actual_C * 2 + 2, f_sm)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl + 1, f_gl)]; -// } else { -// // if (debug) printf("load up+1 vsm[%d]: 0.0\n", actual_C * 2 + 2); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, actual_C * 2 + 2, f_sm)] = 0.0; -// } -// } - -// if (c_sm == 0) { -// if (c_gl >= 1) { -// // if (debug) printf("load up-1 vsm[0]: %f <- %d %d %d\n", -// // dv1[get_idx(lddv11, lddv12, r_gl, c_gl-1, f_gl)], r_gl, c_gl-1, -// // f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl - 1, f_gl)]; -// } else { -// // if (debug) printf("load up-1 vsm[0]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = 0.0; -// } -// } - -// if (!PADDING) { -// if (c_gl < nc_c - 1) { -// // if (debug) printf("load down vsm[%d]: %f <- %d %d %d\n", c_sm * 2 -// + -// // 3, dv2[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)], r_gl, c_gl, -// f_gl); v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 3, f_sm)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load down vsm[%d]: 0.0\n", c_sm * 2 + 3); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 3, f_sm)] = 0.0; -// } -// } else { -// if (c_gl < nc_c - 2) { -// // if (debug) printf("load down vsm[%d]: %f <- %d %d %d\n", c_sm * 2 -// + -// // 3, dv2[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)], r_gl, c_gl, -// f_gl); v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 3, f_sm)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load down vsm[%d]: 0.0\n", c_sm * 2 + 3); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 3, f_sm)] = 0.0; -// } -// } - -// if (c_gl >= 1 && -// (PADDING && c_gl 
- 1 < nc_c - 2 || !PADDING && c_gl - 1 < nc_c - 1)) -// { -// if (c_sm == 0) { -// // if (debug) printf("PADDING: %d, c_gl-1: %d nc_c-2: %d\n", PADDING, -// // c_gl-1, nc_c - 2); if (debug) printf("load down-1 vsm[1]: %f <- %d -// %d -// // %d\n", dv2[get_idx(lddv11, lddv12, r_gl, c_gl-1, f_gl)], r_gl, -// // c_gl-1, f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, 1, f_sm)] = -// dv2[get_idx(lddv11, lddv12, r_gl, c_gl - 1, f_gl)]; -// } -// } else { -// if (c_sm == 0) { -// // if (debug) printf("load down-1 vsm[1]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, 1, f_sm)] = 0.0; -// } -// } - -// } - -// // load dist/ratio using f_sm for better performance -// // assumption F >= C -// if (r_sm == 0 && c_sm == 0 && f_sm < actual_C) { -// if (blockId * C * 2 + f_sm < nc) { -// dist_c_sm[2 + f_sm] = ddist_c[blockId * C * 2 + f_sm]; -// ratio_c_sm[2 + f_sm] = dratio_c[blockId * C * 2 + f_sm]; -// } else { -// dist_c_sm[2 + f_sm] = 0.0; -// ratio_c_sm[2 + f_sm] = 0.0; -// } - -// if (blockId * C * 2 + actual_C + f_sm < nc) { -// dist_c_sm[2 + actual_C + f_sm] = -// ddist_c[blockId * C * 2 + actual_C + f_sm]; -// ratio_c_sm[2 + actual_C + f_sm] = -// dratio_c[blockId * C * 2 + actual_C + f_sm]; -// } else { -// dist_c_sm[2 + actual_C + f_sm] = 0.0; -// ratio_c_sm[2 + actual_C + f_sm] = 0.0; -// } -// } - -// if (blockId > 0) { -// if (f_sm < 2) { -// dist_c_sm[f_sm] = ddist_c[blockId * C * 2 - 2 + f_sm]; -// ratio_c_sm[f_sm] = dratio_c[blockId * C * 2 - 2 + f_sm]; -// } -// } else { -// if (f_sm < 2) { -// dist_c_sm[f_sm] = 0.0; -// ratio_c_sm[f_sm] = 0.0; -// } -// } - -// __syncthreads(); - -// if (r_gl < nr && c_gl < nc_c && f_gl < nf_c) { -// T h1 = dist_c_sm[c_sm * 2]; -// T h2 = dist_c_sm[c_sm * 2 + 1]; -// T h3 = dist_c_sm[c_sm * 2 + 2]; -// T h4 = dist_c_sm[c_sm * 2 + 3]; -// T r1 = ratio_c_sm[c_sm * 2]; -// T r2 = ratio_c_sm[c_sm * 2 + 1]; -// T r3 = ratio_c_sm[c_sm * 2 + 2]; -// T r4 = 1 - r3; -// T a = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2, f_sm)]; -// T b = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 1, f_sm)]; -// T c = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 2, f_sm)]; -// T d = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 3, f_sm)]; -// T e = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 4, f_sm)]; - -// // if (debug) { -// // printf("c_sm(%d) %f %f %f %f %f\n",c_sm, a,b,c,d,e); -// // printf("c_sm_h(%d) %f %f %f %f\n",c_sm, h1,h2,h3,h4); -// // printf("c_sm_r(%d) %f %f %f %f\n",c_sm, r1,r2,r3,r4); -// // } - -// // T tb = a * h1 + b * 2 * (h1+h2) + c * h2; -// // T tc = b * h2 + c * 2 * (h2+h3) + d * h3; -// // T td = c * h3 + d * 2 * (h3+h4) + e * h4; - -// // if (debug) printf("c_sm(%d) tb tc td tc: %f %f %f %f\n", f_sm, tb, tc, -// // td, tc+tb * r1 + td * r4); - -// // tc += tb * r1 + td * r4; - -// // if (r_gl == 0 && f_gl == 0 && r_sm == 0 && f_sm == 0) { -// // printf("mr2(%d) mm2: %f -> (%d %d %d)\n", c_sm, tc, r_gl, c_gl, -// f_gl); -// // // printf("f_sm(%d) b c d: %f %f %f\n", f_sm, tb, tc, td); -// // } - -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = -// mass_trans(a, b, c, d, e, h1, h2, h3, h4, r1, r2, r3, r4); - -// // if (debug) printf("store[%d %d %d] %f \n", r_gl, c_gl, f_gl, -// // mass_trans(a, b, c, d, e, h1, h2, h3, h4, r1, r2, r3, r4)); - -// // printf("%d %d %d\n", r_gl, c_gl, f_gl); -// // if (blockId * C + C == nc-1) { -// // if (c_gl + 1 == nc_c - 1) { -// // // T te = h4 * d + 2 * h4 * e; -// // // te += td * r3; -// // dw[get_idx(lddw1, lddw2, r_gl, blockId * C + actual_C, f_gl)] = -// // mass_trans(c, d, e, (T)0.0, (T)0.0, -// // h1, h2, (T)0.0, 
(T)0.0, r1, r2, (T)0.0, (T)0.0); -// // } -// // } - -// } - -// } - -// template -// void lpk_reo_2_3d_adaptive_launcher(Handle &handle, SIZE nr, SIZE nc, -// SIZE nf_c, SIZE nc_c, T *ddist_c, T -// *dratio_c, T *dv1, SIZE lddv11, SIZE -// lddv12, T *dv2, SIZE lddv21, SIZE lddv22, -// T *dw, SIZE lddw1, SIZE lddw2, int -// queue_idx) { -// cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); -// cudaDeviceSetCacheConfig(cudaFuncCachePreferShared); -// SIZE total_thread_z = nr; -// SIZE total_thread_y = nc_c; -// SIZE total_thread_x = nf_c; -// SIZE tbx, tby, tbz, gridx, gridy, gridz; -// dim3 threadsPerBlock, blockPerGrid; -// size_t sm_size; - -// tbz = R; -// tby = C; -// tbx = F; -// sm_size = (R * (C * 2 + 3) * F + (C * 2 + 3) * 2) * sizeof(T); -// gridz = ceil((float)total_thread_z / tbz); -// gridy = ceil((float)total_thread_y / tby); -// gridx = ceil((float)total_thread_x / tbx); -// threadsPerBlock = dim3(tbx, tby, tbz); -// blockPerGrid = dim3(gridx, gridy, gridz); - -// // printf("nr: %d nc: %d, nf_c: %d, nc_c: %d\n", nr, nc, nf_c, nc_c); -// // printf("tb: %d %d %d, grid: %d %d %d\n", tbx, tby, tbz, gridx, gridy, -// // gridz); - -// _lpk_reo_2_3d<<>>( -// nr, nc, nf_c, nc_c, ddist_c, dratio_c, dv1, lddv11, lddv12, dv2, -// lddv21, lddv22, dw, lddw1, lddw2); -// gpuErrchk(cudaGetLastError()); -// if (handle.sync_and_check_all_kernels) { -// gpuErrchk(cudaDeviceSynchronize()); -// } -// } - -// template -// void lpk_reo_2_3d(Handle &handle, SIZE nr, SIZE nc, SIZE nf_c, SIZE -// nc_c, -// T *ddist_c, T *dratio_c, T *dv1, SIZE lddv11, SIZE lddv12, -// T *dv2, SIZE lddv21, SIZE lddv22, T *dw, SIZE lddw1, SIZE -// lddw2, int queue_idx, int config) { - -// #define LPK(R, C, F) \ -// { \ -// lpk_reo_2_3d_adaptive_launcher( \ -// handle, nr, nc, nf_c, nc_c, ddist_c, dratio_c, dv1, lddv11, lddv12, \ -// dv2, lddv21, lddv22, dw, lddw1, lddw2, queue_idx); \ -// } - -// bool profile = false; -// if (handle.profile_kernels) { -// profile = true; -// } -// if (D == 3) { -// if (profile || config == 6) { -// LPK(2, 2, 128) -// } -// if (profile || config == 5) { -// LPK(2, 2, 64) -// } -// if (profile || config == 4) { -// LPK(2, 2, 32) -// } -// if (profile || config == 3) { -// LPK(4, 4, 16) -// } -// if (profile || config == 2) { -// LPK(8, 8, 8) -// } -// if (profile || config == 1) { -// LPK(4, 4, 4) -// } -// if (profile || config == 0) { -// LPK(2, 2, 2) -// } -// } else if (D == 2) { -// if (profile || config == 6) { -// LPK(1, 2, 128) -// } -// if (profile || config == 5) { -// LPK(1, 2, 64) -// } -// if (profile || config == 4) { -// LPK(1, 2, 32) -// } -// if (profile || config == 3) { -// LPK(1, 4, 16) -// } -// if (profile || config == 2) { -// LPK(1, 8, 8) -// } -// if (profile || config == 1) { -// LPK(1, 4, 4) -// } -// if (profile || config == 0) { -// LPK(1, 2, 4) -// } -// } else { -// printf("Error: mass_trans_multiply_2_cpt is only for 3D and 2D data\n"); -// } -// #undef LPK -// } - -// template -// __global__ void _lpk_reo_3_3d(SIZE nr, SIZE nc_c, SIZE nf_c, SIZE nr_c, T -// *ddist_r, -// T *dratio_r, T *dv1, SIZE lddv11, SIZE lddv12, -// T *dv2, SIZE lddv21, SIZE lddv22, T *dw, SIZE -// lddw1, SIZE lddw2) { - -// // bool debug = false; -// // if (blockIdx.z == 0 && blockIdx.y == 0 && blockIdx.x == 0 && -// // threadIdx.y == 0 && threadIdx.x == 0 ) debug = true; - -// // bool debug2 = false; -// // if (blockIdx.z == 0 && blockIdx.y == 0 && blockIdx.x == 0) -// // debug2 = true; - -// bool PADDING = (nr % 2 == 0); -// T *sm = SharedMemory(); -// SIZE ldsm1 = F; 
-// SIZE ldsm2 = C; -// T *v_sm = sm; -// T *dist_r_sm = sm + ldsm1 * ldsm2 * (R * 2 + 3); -// T *ratio_r_sm = dist_r_sm + (R * 2 + 3); - -// SIZE r_gl = blockIdx.z * blockDim.z + threadIdx.z; -// SIZE c_gl = blockIdx.y * blockDim.y + threadIdx.y; -// SIZE f_gl = blockIdx.x * blockDim.x + threadIdx.x; - -// // if (debug) printf("debugging gl: %d %d %d\n", r_gl, c_gl, f_gl); - -// SIZE blockId = blockIdx.z; - -// SIZE r_sm = threadIdx.z; -// SIZE c_sm = threadIdx.y; -// SIZE f_sm = threadIdx.x; - -// SIZE actual_R = R; -// if (nr_c - blockIdx.z * blockDim.z < R) { -// actual_R = nr_c - blockIdx.z * blockDim.z; -// } -// // if (nr_c % 2 == 1){ -// // if(nr_c-1 - blockIdx.z * blockDim.z < R) { actual_R = nr_c - 1 - -// // blockIdx.z * blockDim.z; } -// // } else { -// // if(nr_c - blockIdx.z * blockDim.z < R) { actual_R = nr_c - blockIdx.z -// * -// // blockDim.z; } -// // } - -// // if (debug) printf("actual_R %d\n", actual_R); - -// // if (debug) printf("RCF: %d %d %d\n", R, C, F); -// if (r_gl < nr_c && c_gl < nc_c && f_gl < nf_c) { -// // if (debug) printf("load front vsm[%d]: %f <- %d %d %d\n", r_sm * 2 + -// 2, -// // dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)], r_gl, c_gl, f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 2, c_sm, f_sm)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)]; - -// if (r_sm == actual_R - 1) { -// if (r_gl + 1 < nr_c) { -// // if (debug) printf("load front+1 vsm[%d]: %f <- %d %d %d\n", -// actual_R -// // * 2 + 2, dv1[get_idx(lddv11, lddv12, blockId * R + actual_R, c_gl, -// // f_gl)], blockId * R + actual_R, c_gl, f_gl); -// v_sm[get_idx(ldsm1, ldsm2, actual_R * 2 + 2, c_sm, f_sm)] = -// dv1[get_idx(lddv11, lddv12, r_gl + 1, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load front+1 vsm[%d]: 0.0\n", actual_R * 2 + -// 2); v_sm[get_idx(ldsm1, ldsm2, actual_R * 2 + 2, c_sm, f_sm)] = 0.0; -// } -// } - -// if (r_sm == 0) { -// if (r_gl >= 1) { -// // if (debug) printf("load front-1 vsm[0]: %f <- %d %d %d\n", -// // dv1[get_idx(lddv11, lddv12, r_gl-1, c_gl, f_gl)], r_gl-1, c_gl, -// // f_gl); -// v_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = -// dv1[get_idx(lddv11, lddv12, r_gl - 1, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load front-1 vsm[0]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = 0.0; -// } -// } - -// if (!PADDING) { -// if (r_gl < nr_c - 1) { -// // if (debug) printf("load back vsm[%d]: %f <- %d %d %d\n", r_sm * 2 -// + -// // 3, dv2[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)], r_gl, c_gl, -// f_gl); v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 3, c_sm, f_sm)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load back vsm[%d]: 0.0\n", r_sm * 2 + 3); -// v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 3, c_sm, f_sm)] = 0.0; -// } -// } else { -// if (r_gl < nr_c - 2) { -// // if (debug) printf("load back vsm[%d]: %f <- %d %d %d\n", r_sm * 2 -// + -// // 3, dv2[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)], r_gl, c_gl, -// f_gl); v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 3, c_sm, f_sm)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load back vsm[%d]: 0.0\n", r_sm * 2 + 3); -// v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 3, c_sm, f_sm)] = 0.0; -// } -// } - -// if (r_gl >= 1 && -// (PADDING && r_gl - 1 < nr_c - 2 || !PADDING && r_gl < nr_c )) { -// // if (blockId > 0) { -// if (r_sm == 0) { -// // if (debug) printf("load back-1 vsm[1]: %f <- %d %d %d\n", -// // dv2[get_idx(lddv11, lddv12, r_gl-1, c_gl, f_gl)], r_gl-1, c_gl, -// // f_gl); -// 
v_sm[get_idx(ldsm1, ldsm2, 1, c_sm, f_sm)] = -// dv2[get_idx(lddv11, lddv12, r_gl - 1, c_gl, f_gl)]; -// } -// } else { -// if (r_sm == 0) { -// // if (debug) printf("load back-1 vsm[1]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, 1, c_sm, f_sm)] = 0.0; -// } -// } -// } - -// // load dist/ratio using f_sm for better performance -// // assumption F >= R -// // if (debug2) printf("actual_R: %u\n", actual_R); -// if (r_sm == 0 && c_sm == 0 && f_sm < actual_R) { -// // if (debug2) printf(" RCF (%u %u %u)blockid(%u) fsm(%u) nr(%u)\n", R, -// C, F, blockId, blockId * R * 2 + f_sm, nr); if (blockId * R * 2 + f_sm < -// nr) { - -// dist_r_sm[2 + f_sm] = ddist_r[blockId * R * 2 + f_sm]; -// // if (debug2 ) printf("load dist 1 [%d]: %f [%d]\n", 2 + f_sm, -// // dist_r_sm[2 + f_sm], blockId * R * 2 + f_sm); -// ratio_r_sm[2 + f_sm] = dratio_r[blockId * R * 2 + f_sm]; -// // if (debug2 )printf("load ratio 1 [%d]: %f [%d]\n", 2 + f_sm, -// // ratio_r_sm[2 + f_sm], blockId * R * 2 + f_sm); -// } else { -// dist_r_sm[2 + f_sm] = 0.0; -// ratio_r_sm[2 + f_sm] = 0.0; -// } -// if (blockId * R * 2 + actual_R + f_sm < nr) { -// dist_r_sm[2 + actual_R + f_sm] = -// ddist_r[blockId * R * 2 + actual_R + f_sm]; -// // if (debug2 )printf("load dist 2 [%d]: %f [%d]\n", 2 + actual_R + -// f_sm, -// // dist_r_sm[2 + actual_R + f_sm], blockId * R * 2 + actual_R + f_sm); -// ratio_r_sm[2 + actual_R + f_sm] = -// dratio_r[blockId * R * 2 + actual_R + f_sm]; -// // if (debug2 )printf("load ratio 2 [%d]: %f [%d]\n", 2 + actual_R + -// f_sm, -// // ratio_r_sm[2 + actual_R + f_sm], blockId * R * 2 + actual_R + f_sm); -// } else { -// dist_r_sm[2 + actual_R + f_sm] = 0.0; -// ratio_r_sm[2 + actual_R + f_sm] = 0.0; -// } -// } - -// if (blockId > 0) { -// if (f_sm < 2) { -// dist_r_sm[f_sm] = ddist_r[blockId * R * 2 - 2 + f_sm]; -// // if (debug2 )printf("load dist -1 [%d]: %f [%d]\n", f_sm, -// // dist_r_sm[f_sm], blockId * R * 2 - 2 + f_sm); -// ratio_r_sm[f_sm] = dratio_r[blockId * R * 2 - 2 + f_sm]; -// // if (debug2 )printf("load ratio -1 [%d]: %f [%d]\n", f_sm, -// // ratio_r_sm[f_sm], blockId * R * 2 - 2 + f_sm); -// } -// } else { -// if (f_sm < 2) { -// dist_r_sm[f_sm] = 0.0; -// ratio_r_sm[f_sm] = 0.0; -// } -// } - -// __syncthreads(); - -// int adjusted_nr_c = nr_c; -// if (r_gl < nr_c && c_gl < nc_c && f_gl < nf_c) { -// T h1 = dist_r_sm[r_sm * 2]; -// T h2 = dist_r_sm[r_sm * 2 + 1]; -// T h3 = dist_r_sm[r_sm * 2 + 2]; -// T h4 = dist_r_sm[r_sm * 2 + 3]; -// T r1 = ratio_r_sm[r_sm * 2]; -// T r2 = ratio_r_sm[r_sm * 2 + 1]; -// T r3 = ratio_r_sm[r_sm * 2 + 2]; -// T r4 = 1 - r3; -// T a = v_sm[get_idx(ldsm1, ldsm2, r_sm * 2, c_sm, f_sm)]; -// T b = v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 1, c_sm, f_sm)]; -// T c = v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 2, c_sm, f_sm)]; -// T d = v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 3, c_sm, f_sm)]; -// T e = v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 4, c_sm, f_sm)]; - -// // __syncthreads(); -// // if (debug) { -// // printf("r_sm(%d) %f %f %f %f %f\n",r_sm, a,b,c,d,e); -// // printf("r_sm_h(%d) %f %f %f %f\n",r_sm, h1,h2,h3,h4); -// // printf("r_sm_r(%d) %f %f %f %f\n",r_sm, r1,r2,r3,r4); -// // } -// // __syncthreads(); - -// // T tb = a * h1/6 + b * 2 * (h1+h2)/6 + c * h2/6; -// // T tc = b * h2/6 + c * 2 * (h2+h3)/6 + d * h3/6; -// // T td = c * h3/6 + d * 2 * (h3+h4)/6 + e * h4/6; - -// // if (debug) printf("f_sm(%d) tb tc td tc: %f %f %f %f\n", f_sm, tb, tc, -// // td, tc+tb * r1 + td * r4); - -// // tc += tb * r1 + td * r4; - -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = -// 
mass_trans(a, b, c, d, e, h1, h2, h3, h4, r1, r2, r3, r4);
-
-// // if (debug) printf("store[%d %d %d] %f (%f)\n", r_gl, c_gl, f_gl,
-// // mass_trans(a, b, c, d, e, h1, h2, h3, h4, r1, r2, r3, r4),
-// // mass_trans(a, b, c, (T)0.0, (T)0.0, h1, (T)0.0,
-(T)0.0,
-// // h4, r1, r2, (T)0.0, (T)0.0));
-// // // printf("%d %d %d\n", r_gl, c_gl, f_gl);
-// // if (blockId * R + R == nr-1) {
-// // if (r_gl+1 == nr_c - 1) {
-// // if (r_gl+1 == nr_c - 1) {
-// // // T te = h4 * d + 2 * h4 * e;
-// // // te += td * r3;
-// // dw[get_idx(lddw1, lddw2, blockId * R + actual_R, c_gl, f_gl)] =
-// // mass_trans(c, d, e, (T)0.0, (T)0.0,
-// // h1, h2, (T)0.0, (T)0.0, r1, r2, (T)0.0, (T)0.0);
-
-// // if (debug) printf("store-last[%d %d %d] %f\n", blockId * R +
-actual_R,
-// // c_gl, f_gl,
-// // mass_trans(c, d, e, (T)0.0, (T)0.0,
-// // h1, h2, (T)0.0, (T)0.0, r1, r2, (T)0.0, (T)0.0));
-// // }
-// //}
-// }
-// }
-
-// template <DIM D, typename T, SIZE R, SIZE C, SIZE F>
-// void lpk_reo_3_3d_adaptive_launcher(Handle<D, T> &handle, SIZE nr, SIZE nc_c,
-//                                     SIZE nf_c, SIZE nr_c, T *ddist_r, T
-//                                     *dratio_r, T *dv1, SIZE lddv11, SIZE
-//                                     lddv12, T *dv2, SIZE lddv21, SIZE lddv22,
-//                                     T *dw, SIZE lddw1, SIZE lddw2, int
-//                                     queue_idx) {
-
-//   SIZE total_thread_z = nr_c;
-//   // if (nr_c % 2 == 1){ total_thread_z = nr_c - 1; }
-//   // else { total_thread_z = nr_c; }
-//   SIZE total_thread_y = nc_c;
-//   SIZE total_thread_x = nf_c;
-
-//   SIZE tbx, tby, tbz, gridx, gridy, gridz;
-//   dim3 threadsPerBlock, blockPerGrid;
-//   size_t sm_size;
-
-//   tbz = R;
-//   tby = C;
-//   tbx = F;
-//   sm_size = ((R * 2 + 3) * C * F + (R * 2 + 3) * 2) * sizeof(T);
-//   gridz = ceil((float)total_thread_z / tbz);
-//   gridy = ceil((float)total_thread_y / tby);
-//   gridx = ceil((float)total_thread_x / tbx);
-//   threadsPerBlock = dim3(tbx, tby, tbz);
-//   blockPerGrid = dim3(gridx, gridy, gridz);
-
-//   // printf("nr: %d nc_c: %d, nf_c: %d, nr_c: %d\n", nr, nc_c, nf_c, nr_c);
-//   // printf("tb: %d %d %d, grid: %d %d %d\n", tbx, tby, tbz, gridx, gridy,
-//   // gridz);
-//   _lpk_reo_3_3d<T, R, C, F><<<blockPerGrid, threadsPerBlock, sm_size,
-//                               *(cudaStream_t *)handle.get(queue_idx)>>>(
-//       nr, nc_c, nf_c, nr_c, ddist_r, dratio_r, dv1, lddv11, lddv12, dv2,
-//       lddv21, lddv22, dw, lddw1, lddw2);
-//   gpuErrchk(cudaGetLastError());
-//   if (handle.sync_and_check_all_kernels) {
-//     gpuErrchk(cudaDeviceSynchronize());
-//   }
-// }
-
-// template <DIM D, typename T>
-// void lpk_reo_3_3d(Handle<D, T> &handle, SIZE nr, SIZE nc_c, SIZE nf_c, SIZE
-// nr_c,
-//                   T *ddist_r, T *dratio_r, T *dv1, SIZE lddv11, SIZE lddv12,
-//                   T *dv2, SIZE lddv21, SIZE lddv22, T *dw, SIZE lddw1, SIZE
-//                   lddw2, int queue_idx, int config) {
-
-// #define LPK(R, C, F) \
-//   { \
-//     lpk_reo_3_3d_adaptive_launcher<D, T, R, C, F>( \
-//         handle, nr, nc_c, nf_c, nr_c, ddist_r, dratio_r, dv1, lddv11, lddv12, \
-//         dv2, lddv21, lddv22, dw, lddw1, lddw2, queue_idx); \
-//   }
-//   bool profile = false;
-//   if (handle.profile_kernels) {
-//     profile = true;
-//   }
-//   if (D == 3) {
-//     if (profile || config == 6) {
-//       LPK(2, 2, 128)
-//     }
-//     if (profile || config == 5) {
-//       LPK(2, 2, 64)
-//     }
-//     if (profile || config == 4) {
-//       LPK(2, 2, 32)
-//     }
-//     if (profile || config == 3) {
-//       LPK(4, 4, 16)
-//     }
-//     if (profile || config == 2) {
-//       LPK(8, 8, 8)
-//     }
-//     if (profile || config == 1) {
-//       LPK(4, 4, 4)
-//     }
-//     if (profile || config == 0) {
-//       LPK(2, 2, 2)
-//     }
-//   } else {
-//     printf("Error: mass_trans_multiply_3_cpt is only for 3D data\n");
-//   }
-
-// #undef LPK
-// }
-
 } // namespace mgard_x
 
 #endif
\ No newline at end of file
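Annotation: the hunk below replaces the commented-out `calc_*` declarations with public `CalcCoefficients3D`/`CalcCorrection3D`-style entry points plus `CopyND`/`AddND`/`SubtractND` helpers. A minimal sketch of how `decompose` might string these together for one level; the control flow is condensed from the removed .hpp implementation (the shipped code carves per-level sub-arrays and targets only the coarse block with the add), so treat it as illustrative only:

```cpp
#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.h"

namespace mgard_x {

// Illustrative one-level pass over the new API (not the shipped decompose).
template <DIM D, typename T, typename DeviceType>
void decompose_one_level(Hierarchy<D, T, DeviceType> &hierarchy,
                         SubArray<D, T, DeviceType> &v,
                         SubArray<D, T, DeviceType> &work, SIZE l,
                         int queue_idx) {
  CopyND(v, work, queue_idx); // stage the level-l data
  // Split work into coarse nodes + detail coefficients, written into v.
  CalcCoefficients3D(hierarchy, work, v, l, queue_idx);
  // Mass-transfer plus tri-diagonal solves produce the L2 correction.
  CalcCorrection3D(hierarchy, v, work, l, queue_idx);
  // The real decompose adds the correction only to the coarse sub-block.
  AddND(work, v, queue_idx);
}

} // namespace mgard_x
```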
diff --git a/include/mgard-x/DataRefactoring/MultiDimension/DataRefactoring.h b/include/mgard-x/DataRefactoring/MultiDimension/DataRefactoring.h
index d422509906..c59b246d2a 100644
--- a/include/mgard-x/DataRefactoring/MultiDimension/DataRefactoring.h
+++ b/include/mgard-x/DataRefactoring/MultiDimension/DataRefactoring.h
@@ -9,50 +9,64 @@ #define MGARD_X_DATA_REFACTORING
 
 // #include "Common.h"
-#include "../../Hierarchy.h"
+#include "../../Hierarchy/Hierarchy.h"
 #include "../../RuntimeX/RuntimeXPublic.h"
 
 namespace mgard_x {
 
-// template <DIM D, typename T, typename DeviceType>
-// void calc_coeff_pointers(Hierarchy<D, T, DeviceType> &hierarchy, DIM curr_dims[3], DIM l,
-//                          SubArray<D, T, DeviceType> doutput,
-//                          SubArray<D, T, DeviceType> &dcoarse,
-//                          SubArray<D, T, DeviceType> &dcoeff_f,
-//                          SubArray<D, T, DeviceType> &dcoeff_c,
-//                          SubArray<D, T, DeviceType> &dcoeff_r,
-//                          SubArray<D, T, DeviceType> &dcoeff_cf,
-//                          SubArray<D, T, DeviceType> &dcoeff_rf,
-//                          SubArray<D, T, DeviceType> &dcoeff_rc,
-//                          SubArray<D, T, DeviceType> &dcoeff_rcf);
-
-// template <DIM D, typename T, typename DeviceType>
-// void calc_coefficients_3d(Hierarchy<D, T, DeviceType> &hierarchy, SubArray<D, T, DeviceType> dinput,
-//                           SubArray<D, T, DeviceType> &doutput, SIZE l, int queue_idx);
-
-// template <DIM D, typename T, typename DeviceType>
-// void coefficients_restore_3d(Hierarchy<D, T, DeviceType> &hierarchy, SubArray<D, T, DeviceType>
-// dinput,
-//                              SubArray<D, T, DeviceType> &doutput, SIZE l, int queue_idx);
-
-// template <DIM D, typename T, typename DeviceType>
-// void calc_correction_3d(Hierarchy<D, T, DeviceType> &hierarchy, SubArray<D, T, DeviceType> dcoeff,
-//                         SubArray<D, T, DeviceType> &dcorrection, SIZE l, int queue_idx);
-
-// template <DIM D, typename T, typename DeviceType>
-// void calc_coefficients_nd(Hierarchy<D, T, DeviceType> &hierarchy, SubArray<D, T, DeviceType> dinput1,
-//                           SubArray<D, T, DeviceType> dinput2,
-//                           SubArray<D, T, DeviceType> &doutput, SIZE l, int queue_idx);
-
-// template <DIM D, typename T, typename DeviceType>
-// void coefficients_restore_nd(Hierarchy<D, T, DeviceType> &hierarchy, SubArray<D, T, DeviceType>
-// dinput1,
-//                              SubArray<D, T, DeviceType> dinput2,
-//                              SubArray<D, T, DeviceType> &doutput, SIZE l, int queue_idx);
-
-// template <DIM D, typename T, typename DeviceType>
-// void calc_correction_nd(Hierarchy<D, T, DeviceType> &hierarchy, SubArray<D, T, DeviceType> dcoeff,
-//                         SubArray<D, T, DeviceType> &dcorrection, SIZE l, int queue_idx);
+static bool multidim_refactoring_store = false;
+static bool multidim_refactoring_verify = false;
+static bool multidim_refactoring_debug_print = false;
+
+template <DIM D, typename T, typename DeviceType>
+void CalcCoefficients3D(Hierarchy<D, T, DeviceType> &hierarchy,
+                        SubArray<D, T, DeviceType> dinput,
+                        SubArray<D, T, DeviceType> &doutput, SIZE l,
+                        int queue_idx);
+
+template <DIM D, typename T, typename DeviceType>
+void CoefficientsRestore3D(Hierarchy<D, T, DeviceType> &hierarchy,
+                           SubArray<D, T, DeviceType> dinput,
+                           SubArray<D, T, DeviceType> &doutput, SIZE l,
+                           int queue_idx);
+
+template <DIM D, typename T, typename DeviceType>
+void CalcCorrection3D(Hierarchy<D, T, DeviceType> &hierarchy,
+                      SubArray<D, T, DeviceType> dcoeff,
+                      SubArray<D, T, DeviceType> &dcorrection, SIZE l,
+                      int queue_idx);
+
+template <DIM D, typename T, typename DeviceType>
+void CalcCoefficientsND(Hierarchy<D, T, DeviceType> &hierarchy,
+                        SubArray<D, T, DeviceType> dinput1,
+                        SubArray<D, T, DeviceType> dinput2,
+                        SubArray<D, T, DeviceType> &doutput, SIZE l,
+                        int queue_idx);
+
+template <DIM D, typename T, typename DeviceType>
+void CoefficientsRestoreND(Hierarchy<D, T, DeviceType> &hierarchy,
+                           SubArray<D, T, DeviceType> dinput1,
+                           SubArray<D, T, DeviceType> dinput2,
+                           SubArray<D, T, DeviceType> &doutput, SIZE l,
+                           int queue_idx);
+
+template <DIM D, typename T, typename DeviceType>
+void CalcCorrectionND(Hierarchy<D, T, DeviceType> &hierarchy,
+                      SubArray<D, T, DeviceType> dcoeff,
+                      SubArray<D, T, DeviceType> &dcorrection, SIZE l,
+                      int queue_idx);
+
+template <DIM D, typename T, typename DeviceType>
+void CopyND(SubArray<D, T, DeviceType> dinput,
+            SubArray<D, T, DeviceType> &doutput, int queue_idx);
+
+template <DIM D, typename T, typename DeviceType>
+void AddND(SubArray<D, T, DeviceType> dinput,
+           SubArray<D, T, DeviceType> &doutput, int queue_idx);
+
+template <DIM D, typename T, typename DeviceType>
+void SubtractND(SubArray<D, T, DeviceType> dinput,
+                SubArray<D, T, DeviceType> &doutput, int queue_idx);
 
 template <DIM D, typename T, typename DeviceType>
 void decompose(Hierarchy<D, T, DeviceType> &hierarchy,
diff --git a/include/mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp b/include/mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp
index d994ef7b21..8c299be04b 100644
--- a/include/mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp
+++ b/include/mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp
@@ -5,1684 +5,15 @@ * Date: March 17, 2022
  */
 
-#include "../../Hierarchy.hpp"
+#include "../../Hierarchy/Hierarchy.hpp"
 #include "../../RuntimeX/RuntimeX.h"
 
-// #include "SubArray.hpp"
-// #include "DeviceAdapters/DeviceAdapterCuda.h"
-
-// #include "DataRefactoring/Coefficient/GridProcessingKernel.h"
 #include "Coefficient/GridProcessingKernel.hpp"
-// #include
"cuda/DataRefactoring/Coefficient/GridProcessingKernel2.hpp" - -// #include "DataRefactoring/Coefficient/GridProcessingKernel3D.h" -#include "Coefficient/GridProcessingKernel3D.hpp" -// #include "cuda/DataRefactoring/Coefficient/GridProcessingKernel2.hpp" -// #include "DataRefactoring/Correction/IterativeProcessingKernel.h" -// #include "DataRefactoring/Correction/IterativeProcessingKernel3D.h" -#include "Correction/IterativeProcessingKernel.hpp" -#include "Correction/IterativeProcessingKernel3D.hpp" -// #include "LevelwiseProcessingKernel.h" -#include "Correction/LevelwiseProcessingKernel.hpp" -// #include "DataRefactoring/Correction/LinearProcessingKernel.h" -#include "Correction/LinearProcessingKernel.hpp" -// #include "DataRefactoring/Correction/LinearProcessingKernel3D.h" -#include "Correction/LinearProcessingKernel3D.hpp" #include "DataRefactoring.h" -// #include "cuda/Testing/ReorderToolsGPU.hpp" - #include -#include namespace mgard_x { -static bool store = false; -static bool verify = false; -static bool debug_print = false; - -template -void CompareSubarray4D(SubArrayType subArray1, SubArrayType subArray2) { - if (SubArrayType::NumDims != 4) { - std::cout << log::log_err - << "CompareSubarray4D expects 4D subarray type.\n"; - exit(-1); - } - if (subArray1.getShape(3) != subArray2.getShape(3)) { - std::cout << log::log_err << "CompareSubarray4D mismatch 4D size.\n"; - exit(-1); - } - - using T = typename SubArrayType::DataType; - SIZE idx[4] = {0, 0, 0, 0}; - for (SIZE i = 0; i < subArray1.getShape(3); i++) { - idx[3] = i; - SubArrayType temp1 = subArray1; - SubArrayType temp2 = subArray2; - temp1.offset(3, i); - temp2.offset(3, i); - CompareSubarray("4D = " + std::to_string(i), temp1.Slice3D(0, 1, 2), - temp2.Slice3D(0, 1, 2)); - } -} - -template -void PrintSubarray4D(std::string name, SubArrayType subArray1) { - if (SubArrayType::NumDims != 4) { - std::cout << log::log_err << "PrintSubarray4D expects 4D subarray type.\n"; - exit(-1); - } - std::cout << name << "\n"; - using T = typename SubArrayType::DataType; - SIZE idx[4] = {0, 0, 0, 0}; - for (SIZE i = 0; i < subArray1.getShape(3); i++) { - idx[3] = i; - SubArrayType temp1 = subArray1; - temp1.offset(3, i); - PrintSubarray("i = " + std::to_string(i), temp1.Slice3D(0, 1, 2)); - } -} - -template -void calc_coeff_pointers( - Hierarchy &hierarchy, DIM curr_dims[3], DIM l, - SubArray doutput, SubArray &dcoarse, - SubArray &dcoeff_f, SubArray &dcoeff_c, - SubArray &dcoeff_r, SubArray &dcoeff_cf, - SubArray &dcoeff_rf, - SubArray &dcoeff_rc, - SubArray &dcoeff_rcf) { - - SIZE n[3]; - SIZE nn[3]; - for (DIM d = 0; d < 3; d++) { - n[d] = hierarchy.dofs[curr_dims[d]][l]; - nn[d] = hierarchy.dofs[curr_dims[d]][l + 1]; - } - - dcoarse = doutput; - dcoarse.resize(curr_dims[0], nn[0]); - dcoarse.resize(curr_dims[1], nn[1]); - dcoarse.resize(curr_dims[2], nn[2]); - - dcoeff_f = doutput; - dcoeff_f.offset(curr_dims[0], nn[0]); - dcoeff_f.resize(curr_dims[0], n[0] - nn[0]); - dcoeff_f.resize(curr_dims[1], nn[1]); - dcoeff_f.resize(curr_dims[2], nn[2]); - - dcoeff_c = doutput; - dcoeff_c.offset(curr_dims[1], nn[1]); - dcoeff_c.resize(curr_dims[0], nn[0]); - dcoeff_c.resize(curr_dims[1], n[1] - nn[1]); - dcoeff_c.resize(curr_dims[2], nn[2]); - - dcoeff_r = doutput; - dcoeff_r.offset(curr_dims[2], nn[2]); - dcoeff_r.resize(curr_dims[0], nn[0]); - dcoeff_r.resize(curr_dims[1], nn[1]); - dcoeff_r.resize(curr_dims[2], n[2] - nn[2]); - - dcoeff_cf = doutput; - dcoeff_cf.offset(curr_dims[0], nn[0]); - dcoeff_cf.offset(curr_dims[1], nn[1]); - 
dcoeff_cf.resize(curr_dims[0], n[0] - nn[0]); - dcoeff_cf.resize(curr_dims[1], n[1] - nn[1]); - dcoeff_cf.resize(curr_dims[2], nn[2]); - - dcoeff_rf = doutput; - dcoeff_rf.offset(curr_dims[0], nn[0]); - dcoeff_rf.offset(curr_dims[2], nn[2]); - dcoeff_rf.resize(curr_dims[0], n[0] - nn[0]); - dcoeff_rf.resize(curr_dims[1], nn[1]); - dcoeff_rf.resize(curr_dims[2], n[2] - nn[2]); - - dcoeff_rc = doutput; - dcoeff_rc.offset(curr_dims[1], nn[1]); - dcoeff_rc.offset(curr_dims[2], nn[2]); - dcoeff_rc.resize(curr_dims[0], nn[0]); - dcoeff_rc.resize(curr_dims[1], n[1] - nn[1]); - dcoeff_rc.resize(curr_dims[2], n[2] - nn[2]); - - dcoeff_rcf = doutput; - dcoeff_rcf.offset(curr_dims[0], nn[0]); - dcoeff_rcf.offset(curr_dims[1], nn[1]); - dcoeff_rcf.offset(curr_dims[2], nn[2]); - dcoeff_rcf.resize(curr_dims[0], n[0] - nn[0]); - dcoeff_rcf.resize(curr_dims[1], n[1] - nn[1]); - dcoeff_rcf.resize(curr_dims[2], n[2] - nn[2]); -} - -template -void calc_coefficients_3d(Hierarchy &hierarchy, - SubArray dinput, - SubArray &doutput, SIZE l, - int queue_idx) { - - int range_l = std::min(6, (int)std::log2(hierarchy.dofs[0][l]) - 1); - int range_lp1 = std::min(6, (int)std::log2(hierarchy.dofs[0][l + 1]) - 1); - - std::string prefix = "decomp_"; - if (sizeof(T) == sizeof(double)) - prefix += "d_"; - if (sizeof(T) == sizeof(float)) - prefix += "f_"; - for (int d = 0; d < D; d++) - prefix += std::to_string(hierarchy.shape[d]) + "_"; - - dinput.project(0, 1, 2); - doutput.project(0, 1, 2); - - SIZE f = hierarchy.dofs[0][l]; - SIZE c = hierarchy.dofs[1][l]; - SIZE r = hierarchy.dofs[2][l]; - SIZE ff = hierarchy.dofs[0][l + 1]; - SIZE cc = hierarchy.dofs[1][l + 1]; - SIZE rr = hierarchy.dofs[2][l + 1]; - - SubArray dcoarse = doutput; - dcoarse.resize({ff, cc, rr}); - SubArray dcoeff_f = doutput; - dcoeff_f.offset({ff, 0, 0}); - dcoeff_f.resize({f - ff, cc, rr}); - SubArray dcoeff_c = doutput; - dcoeff_c.offset({0, cc, 0}); - dcoeff_c.resize({ff, c - cc, rr}); - SubArray dcoeff_r = doutput; - dcoeff_r.offset({0, 0, rr}); - dcoeff_r.resize({ff, cc, r - rr}); - SubArray dcoeff_cf = doutput; - dcoeff_cf.offset({ff, cc, 0}); - dcoeff_cf.resize({f - ff, c - cc, rr}); - SubArray dcoeff_rf = doutput; - dcoeff_rf.offset({ff, 0, rr}); - dcoeff_rf.resize({f - ff, cc, r - rr}); - SubArray dcoeff_rc = doutput; - dcoeff_rc.offset({0, cc, rr}); - dcoeff_rc.resize({ff, c - cc, r - rr}); - SubArray dcoeff_rcf = doutput; - dcoeff_rcf.offset({ff, cc, rr}); - dcoeff_rcf.resize({f - ff, c - cc, r - rr}); - - // SubArray<1, T, DeviceType> ratio_r({hierarchy.dofs[2][l]}, - // hierarchy.ratio[2][l]); SubArray<1, T, DeviceType> - // ratio_c({hierarchy.dofs[1][l]}, hierarchy.ratio[1][l]); SubArray<1, T, - // DeviceType> ratio_f({hierarchy.dofs[0][l]}, hierarchy.ratio[0][l]); - - T *null = NULL; - GpkReo3D().Execute( - hierarchy.dofs[2][l], hierarchy.dofs[1][l], hierarchy.dofs[0][l], - hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[0][l + 1], SubArray(hierarchy.ratio_array[2][l]), - SubArray(hierarchy.ratio_array[1][l]), - SubArray(hierarchy.ratio_array[0][l]), - // ratio_r, ratio_c, ratio_f, - dinput, dcoarse, dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, - dcoeff_rc, dcoeff_rcf, queue_idx); - // hierarchy.sync_all(); - // if (debug_print) { - // PrintSubarray("after pi_Ql_reo", doutput); - // } - - // { - // std::vector shape2_rev(D); - // std::vector shape2_pad_rev(D); - // for (int i = 0; i < D; i++) { - // shape2_rev[i] = hierarchy.dofs[D-1-i][0]; - // shape2_pad_rev[i] = hierarchy.dofs[D-1-i][0] + 2; - // } - // 
mgard_cuda::Array input2(shape2_rev); - // mgard_cuda::Array work2(shape2_pad_rev); - - // MemoryManager::CopyND(input2.get_dv(), - // in_array2.get_ldvs_h()[0], - // dinput.data(), in_array.getLd(0), - // hierarchy.dofs[0][0], - // hierarchy.dofs[1][0] * - // hierarchy.linearized_depth, 0); - - // gpk_reo_3d( - // hierarchy, hierarchy.dofs[2][l], hierarchy.dofs[1][l], - // hierarchy.dofs[0][l], hierarchy.ratio[2][l], hierarchy.ratio[1][l], - // hierarchy.ratio[0][l], dinput.data(), dinput.getLddv1(), - // dinput.getLddv2(), dcoarse.data(), dcoarse.getLddv1(), - // dcoarse.getLddv2(), dcoeff_f.data(), dcoeff_f.getLddv1(), - // dcoeff_f.getLddv2(), dcoeff_c.data(), dcoeff_c.getLddv1(), - // dcoeff_c.getLddv2(), dcoeff_r.data(), dcoeff_r.getLddv1(), - // dcoeff_r.getLddv2(), dcoeff_cf.data(), dcoeff_cf.getLddv1(), - // dcoeff_cf.getLddv2(), dcoeff_rf.data(), dcoeff_rf.getLddv1(), - // dcoeff_rf.getLddv2(), dcoeff_rc.data(), dcoeff_rc.getLddv1(), - // dcoeff_rc.getLddv2(), dcoeff_rcf.data(), dcoeff_rcf.getLddv1(), - // dcoeff_rcf.getLddv2(), queue_idx, - // hierarchy.auto_tuning_cc[hierarchy.arch][hierarchy.precision][range_l]); - - // } - - verify_matrix_cuda( - hierarchy.dofs[2][l], hierarchy.dofs[1][l], hierarchy.dofs[0][l], - doutput.data(), doutput.getLd(0), doutput.getLd(1), doutput.getLd(0), - prefix + "gpk_reo_3d" + "_level_" + std::to_string(l), store, verify); - - if (debug_print) { - PrintSubarray("after pi_Ql_reo", doutput); - } -} - -template -void coefficients_restore_3d(Hierarchy &hierarchy, - SubArray dinput, - SubArray &doutput, SIZE l, - int queue_idx) { - - int range_l = std::min(6, (int)std::log2(hierarchy.dofs[0][l]) - 1); - int range_lp1 = std::min(6, (int)std::log2(hierarchy.dofs[0][l + 1]) - 1); - - std::string prefix = "decomp_"; - if (sizeof(T) == sizeof(double)) - prefix += "d_"; - if (sizeof(T) == sizeof(float)) - prefix += "f_"; - for (int d = 0; d < D; d++) - prefix += std::to_string(hierarchy.shape[d]) + "_"; - - dinput.project(0, 1, 2); - doutput.project(0, 1, 2); - - SIZE f = hierarchy.dofs[0][l]; - SIZE c = hierarchy.dofs[1][l]; - SIZE r = hierarchy.dofs[2][l]; - SIZE ff = hierarchy.dofs[0][l + 1]; - SIZE cc = hierarchy.dofs[1][l + 1]; - SIZE rr = hierarchy.dofs[2][l + 1]; - - SubArray dcoarse = dinput; - dcoarse.resize({ff, cc, rr}); - SubArray dcoeff_f = dinput; - dcoeff_f.offset({ff, 0, 0}); - dcoeff_f.resize({f - ff, cc, rr}); - SubArray dcoeff_c = dinput; - dcoeff_c.offset({0, cc, 0}); - dcoeff_c.resize({ff, c - cc, rr}); - SubArray dcoeff_r = dinput; - dcoeff_r.offset({0, 0, rr}); - dcoeff_r.resize({ff, cc, r - rr}); - SubArray dcoeff_cf = dinput; - dcoeff_cf.offset({ff, cc, 0}); - dcoeff_cf.resize({f - ff, c - cc, rr}); - SubArray dcoeff_rf = dinput; - dcoeff_rf.offset({ff, 0, rr}); - dcoeff_rf.resize({f - ff, cc, r - rr}); - SubArray dcoeff_rc = dinput; - dcoeff_rc.offset({0, cc, rr}); - dcoeff_rc.resize({ff, c - cc, r - rr}); - SubArray dcoeff_rcf = dinput; - dcoeff_rcf.offset({ff, cc, rr}); - dcoeff_rcf.resize({f - ff, c - cc, r - rr}); - - // SubArray<1, T, DeviceType> ratio_r({hierarchy.dofs[2][l]}, - // hierarchy.ratio[2][l]); SubArray<1, T, DeviceType> - // ratio_c({hierarchy.dofs[1][l]}, hierarchy.ratio[1][l]); SubArray<1, T, - // DeviceType> ratio_f({hierarchy.dofs[0][l]}, hierarchy.ratio[0][l]); - - GpkRev3D().Execute( - hierarchy.dofs[2][l], hierarchy.dofs[1][l], hierarchy.dofs[0][l], - hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[0][l + 1], SubArray(hierarchy.ratio_array[2][l]), - 
SubArray(hierarchy.ratio_array[1][l]), - SubArray(hierarchy.ratio_array[0][l]), - // ratio_r, ratio_c, ratio_f, - doutput, dcoarse, dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, - dcoeff_rc, dcoeff_rcf, 0, 0, 0, hierarchy.dofs[2][l], - hierarchy.dofs[1][l], hierarchy.dofs[0][l], queue_idx); - - T *null = NULL; - // gpk_rev_3d( - // hierarchy, hierarchy.dofs[2][l], hierarchy.dofs[1][l], - // hierarchy.dofs[0][l], hierarchy.ratio[2][l], hierarchy.ratio[1][l], - // hierarchy.ratio[0][l], doutput.data(), doutput.getLddv1(), - // doutput.getLddv2(), dcoarse.data(), dcoarse.getLddv1(), - // dcoarse.getLddv2(), - // // null, ldvs_h[0], ldvs_h[1], - // dcoeff_f.data(), dcoeff_f.getLddv1(), dcoeff_f.getLddv2(), - // // null, ldvs_h[0], ldvs_h[1], - // dcoeff_c.data(), dcoeff_c.getLddv1(), dcoeff_c.getLddv2(), - // // null, ldvs_h[0], ldvs_h[1], - // dcoeff_r.data(), dcoeff_r.getLddv1(), dcoeff_r.getLddv2(), - // // null, ldvs_h[0], ldvs_h[1], - // dcoeff_cf.data(), dcoeff_cf.getLddv1(), dcoeff_cf.getLddv2(), - // // null, ldvs_h[0], ldvs_h[1], - // dcoeff_rf.data(), dcoeff_rf.getLddv1(), dcoeff_rf.getLddv2(), - // // null, ldvs_h[0], ldvs_h[1], - // dcoeff_rc.data(), dcoeff_rc.getLddv1(), dcoeff_rc.getLddv2(), - // // null, ldvs_h[0], ldvs_h[1], - // dcoeff_rcf.data(), dcoeff_rcf.getLddv1(), dcoeff_rcf.getLddv2(), - // // null, ldvs_h[0], ldvs_h[1], - // 0, 0, 0, hierarchy.dofs[2][l], hierarchy.dofs[1][l], - // hierarchy.dofs[0][l], queue_idx, - // hierarchy.auto_tuning_cc[hierarchy.arch][hierarchy.precision][range_l]); - - // hierarchy.sync(0); - verify_matrix_cuda( - hierarchy.dofs[2][l], hierarchy.dofs[1][l], hierarchy.dofs[0][l], - doutput.data(), doutput.getLd(0), doutput.getLd(1), doutput.getLd(0), - prefix + "gpk_rev_3d" + "_level_" + std::to_string(l), store, verify); - - // gpk_rev(hierarchy, - // shape, shape_c, hierarchy.ldws_h, ldvs_h, unprocessed_dims, - // 2, 1, 0, - // hierarchy.ratio[2][l], hierarchy.ratio[1][l], - // hierarchy.ratio[0][l], hierarchy.dw, hierarchy.ldws_h[0], - // hierarchy.ldws_h[1], dv, ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], 0, 0, - // hierarchy.dofs[0][l+1]), ldvs_h[0], ldvs_h[1], - // // null, ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], 0, hierarchy.dofs[1][l+1], - // 0), ldvs_h[0], ldvs_h[1], - // // null, ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], hierarchy.dofs[2][l+1], 0, - // 0), ldvs_h[0], ldvs_h[1], - // // null, ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], 0, hierarchy.dofs[1][l+1], - // hierarchy.dofs[0][l+1]), ldvs_h[0], ldvs_h[1], - // // null, ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], hierarchy.dofs[2][l+1], 0, - // hierarchy.dofs[0][l+1]), ldvs_h[0], ldvs_h[1], - // // null, ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], hierarchy.dofs[2][l+1], - // hierarchy.dofs[1][l+1], 0), ldvs_h[0], ldvs_h[1], - // // null,ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], hierarchy.dofs[2][l+1], - // hierarchy.dofs[1][l+1], hierarchy.dofs[0][l+1]), ldvs_h[0], - // ldvs_h[1], - // // null, ldvs_h[0], ldvs_h[1], - // 0, 0, 0, hierarchy.dofs[2][l], hierarchy.dofs[1][l], - // hierarchy.dofs[0][l], 0, - // hierarchy.auto_tuning_cc[hierarchy.arch][hierarchy.precision][range_l]); - - // print_matrix_cuda(hierarchy.dofs[2][l], hierarchy.dofs[1][l], - // hierarchy.dofs[0][l], doutput.data(), doutput.ldvs_h[0], doutput.ldvs_h[1], - // doutput.ldvs_h[0],); - - // gpk_rev(hierarchy, - // shape, shape_c, hierarchy.ldws_h, ldvs_h, unprocessed_dims, - // 2, 1, 0, 
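Annotation: the `offset`/`resize` carving above (and in the removed `calc_coeff_pointers`) addresses the eight sub-blocks of a level-l cube: one coarse block plus seven coefficient blocks, indexed by which dimensions take the fine remainder. A minimal standalone sketch of that decomposition (the `Block` type and `carve` helper are hypothetical; MGARD-X does this through `SubArray::offset`/`SubArray::resize`):

```cpp
// Hypothetical Block type standing in for a SubArray view.
struct Block {
  unsigned off[3];  // start index per dimension (f, c, r)
  unsigned size[3]; // extent per dimension
};

// mask bit d set -> fine remainder [nn[d], n[d]); clear -> coarse prefix
// [0, nn[d]). mask 0 yields dcoarse; 1, 2, 4 -> dcoeff_f/_c/_r;
// 3, 5, 6 -> dcoeff_cf/_rf/_rc; 7 -> dcoeff_rcf.
Block carve(const unsigned n[3], const unsigned nn[3], unsigned mask) {
  Block b;
  for (int d = 0; d < 3; d++) {
    bool fine = (mask >> d) & 1;
    b.off[d] = fine ? nn[d] : 0u;
    b.size[d] = fine ? n[d] - nn[d] : nn[d];
  }
  return b;
}
```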
- // hierarchy.ratio[2][l], hierarchy.ratio[1][l], - // hierarchy.ratio[0][l], hierarchy.dw, hierarchy.ldws_h[0], - // hierarchy.ldws_h[1], dv, ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], 0, 0, - // hierarchy.dofs[0][l+1]), ldvs_h[0], ldvs_h[1], - // // null, ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], 0, hierarchy.dofs[1][l+1], - // 0), ldvs_h[0], ldvs_h[1], - // // null, ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], hierarchy.dofs[2][l+1], 0, - // 0), ldvs_h[0], ldvs_h[1], - // // null, ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], 0, hierarchy.dofs[1][l+1], - // hierarchy.dofs[0][l+1]), ldvs_h[0], ldvs_h[1], - // // null, ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], hierarchy.dofs[2][l+1], 0, - // hierarchy.dofs[0][l+1]), ldvs_h[0], ldvs_h[1], - // // null, ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], hierarchy.dofs[2][l+1], - // hierarchy.dofs[1][l+1], 0), ldvs_h[0], ldvs_h[1], - // // null,ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], hierarchy.dofs[2][l+1], - // hierarchy.dofs[1][l+1], hierarchy.dofs[0][l+1]), ldvs_h[0], - // ldvs_h[1], - // // null, ldvs_h[0], ldvs_h[1], - // 0, 0, 0, hierarchy.dofs[2][l], hierarchy.dofs[1][l], - // hierarchy.dofs[0][l], 0, - // hierarchy.auto_tuning_cc[hierarchy.arch][hierarchy.precision][range_l]); - - if (debug_print) { - PrintSubarray("after coeff-restore", doutput); - } -} - -template -void calc_correction_3d(Hierarchy &hierarchy, - SubArray dcoeff, - SubArray &dcorrection, SIZE l, - int queue_idx) { - - int range_l = std::min(6, (int)std::log2(hierarchy.dofs[0][l]) - 1); - int range_lp1 = std::min(6, (int)std::log2(hierarchy.dofs[0][l + 1]) - 1); - - std::string prefix = "decomp_"; - if (sizeof(T) == sizeof(double)) - prefix += "d_"; - if (sizeof(T) == sizeof(float)) - prefix += "f_"; - for (int d = 0; d < D; d++) - prefix += std::to_string(hierarchy.shape[d]) + "_"; - - SubArray dw_in1, dw_in2, dw_out; - - if (D >= 1) { - dw_in1 = dcoeff; - dw_in1.resize( - {hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l], hierarchy.dofs[2][l]}); - dw_in2 = dcoeff; - dw_in2.offset({hierarchy.dofs[0][l + 1], 0, 0}); - dw_in2.resize({hierarchy.dofs[0][l] - hierarchy.dofs[0][l + 1], - hierarchy.dofs[1][l], hierarchy.dofs[2][l]}); - dw_out = dcorrection; - dw_out.resize( - {hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l], hierarchy.dofs[2][l]}); - - Lpk1Reo3D().Execute( - hierarchy.dofs[2][l], hierarchy.dofs[1][l], hierarchy.dofs[0][l], - hierarchy.dofs[0][l + 1], hierarchy.dofs[2][l + 1], - hierarchy.dofs[1][l + 1], hierarchy.dofs[0][l + 1], - SubArray(hierarchy.dist_array[0][l]), - SubArray(hierarchy.ratio_array[0][l]), dw_in1, dw_in2, dw_out, - queue_idx); - - verify_matrix_cuda( - hierarchy.dofs[2][l], hierarchy.dofs[1][l], hierarchy.dofs[0][l + 1], - dw_out.data(), dw_out.getLd(0), dw_out.getLd(1), dw_out.getLd(0), - prefix + "lpk_reo_1_3d" + "_level_" + std::to_string(l), store, verify); - - if (debug_print) { - PrintSubarray("after mass_trans_multiply_1_cpt", dw_out); - } - } - - if (D >= 2) { - dw_in1 = dw_out; - dw_in1.resize({hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[2][l]}); - dw_in2 = dw_out; - dw_in2.offset({0, hierarchy.dofs[1][l + 1], 0}); - dw_in2.resize({hierarchy.dofs[0][l + 1], - hierarchy.dofs[1][l] - hierarchy.dofs[1][l + 1], - hierarchy.dofs[2][l]}); - dw_out.offset({hierarchy.dofs[0][l + 1], 0, 0}); - dw_out.resize({hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[2][l]}); - - 
Lpk2Reo3D().Execute( - hierarchy.dofs[2][l], hierarchy.dofs[1][l], hierarchy.dofs[0][l + 1], - hierarchy.dofs[1][l + 1], SubArray(hierarchy.dist_array[1][l]), - SubArray(hierarchy.ratio_array[1][l]), dw_in1, dw_in2, dw_out, - queue_idx); - - verify_matrix_cuda(hierarchy.dofs[2][l], hierarchy.dofs[1][l + 1], - hierarchy.dofs[0][l + 1], dw_out.data(), dw_out.getLd(0), - dw_out.getLd(1), dw_out.getLd(0), - prefix + "lpk_reo_2_3d" + "_level_" + std::to_string(l), - store, verify); - - if (debug_print) { - PrintSubarray("after mass_trans_multiply_2_cpt", dw_out); - } - } - - if (D == 3) { - dw_in1 = dw_out; - dw_in1.resize({hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[2][l + 1]}); - dw_in2 = dw_out; - dw_in2.offset({0, 0, hierarchy.dofs[2][l + 1]}); - dw_in2.resize({hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[2][l] - hierarchy.dofs[2][l + 1]}); - dw_out.offset({hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l + 1], 0}); - dw_out.resize({hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[2][l + 1]}); - - Lpk3Reo3D().Execute( - hierarchy.dofs[2][l], hierarchy.dofs[1][l + 1], - hierarchy.dofs[0][l + 1], hierarchy.dofs[2][l + 1], - SubArray(hierarchy.dist_array[2][l]), - SubArray(hierarchy.ratio_array[2][l]), dw_in1, dw_in2, dw_out, - queue_idx); - - verify_matrix_cuda(hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[0][l + 1], dw_out.data(), dw_out.getLd(0), - dw_out.getLd(1), dw_out.getLd(0), - prefix + "lpk_reo_3_3d" + "_level_" + std::to_string(l), - store, verify); - - if (debug_print) { - PrintSubarray("after mass_trans_multiply_3_cpt", dw_out); - } - } - - if (D >= 1) { - Ipk1Reo3D().Execute( - hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[0][l + 1], SubArray(hierarchy.am_array[0][l + 1]), - SubArray(hierarchy.bm_array[0][l + 1]), - SubArray(hierarchy.dist_array[0][l + 1]), dw_out, queue_idx); - verify_matrix_cuda(hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[0][l + 1], dw_out.data(), dw_out.getLd(0), - dw_out.getLd(1), dw_out.getLd(0), - prefix + "ipk_1_3d" + "_level_" + std::to_string(l), - store, verify); - - if (debug_print) { - PrintSubarray("after solve_tridiag_1_cpt", dw_out); - } - } - if (D >= 2) { - Ipk2Reo3D().Execute( - hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[0][l + 1], SubArray(hierarchy.am_array[1][l + 1]), - SubArray(hierarchy.bm_array[1][l + 1]), - SubArray(hierarchy.dist_array[1][l + 1]), dw_out, queue_idx); - - verify_matrix_cuda(hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[0][l + 1], dw_out.data(), dw_out.getLd(0), - dw_out.getLd(1), dw_out.getLd(0), - prefix + "ipk_2_3d" + "_level_" + std::to_string(l), - store, verify); - - if (debug_print) { - PrintSubarray("after solve_tridiag_2_cpt", dw_out); - } - } - if (D == 3) { - Ipk3Reo3D().Execute( - hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[0][l + 1], SubArray(hierarchy.am_array[2][l + 1]), - SubArray(hierarchy.bm_array[2][l + 1]), - SubArray(hierarchy.dist_array[2][l + 1]), dw_out, queue_idx); - - verify_matrix_cuda(hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[0][l + 1], dw_out.data(), dw_out.getLd(0), - dw_out.getLd(1), dw_out.getLd(0), - prefix + "ipk_3_3d" + "_level_" + std::to_string(l), - store, verify); - - if (debug_print) { - PrintSubarray("after solve_tridiag_3_cpt", dw_out); - } - } - // final correction output - dcorrection = dw_out; -} - -template -void calc_coefficients_nd(Hierarchy 
&hierarchy, - SubArray dinput1, - SubArray dinput2, - SubArray &doutput, SIZE l, - int queue_idx) { - - int range_l = std::min(6, (int)std::log2(hierarchy.dofs[0][l]) - 1); - int range_lp1 = std::min(6, (int)std::log2(hierarchy.dofs[0][l + 1]) - 1); - - std::string prefix = "decomp_"; - if (sizeof(T) == sizeof(double)) - prefix += "d_"; - if (sizeof(T) == sizeof(float)) - prefix += "f_"; - for (int d = 0; d < D; d++) - prefix += std::to_string(hierarchy.shape[d]) + "_"; - // printf("interpolate 1-3D\n"); - - SubArray dcoarse, dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, - dcoeff_rf, dcoeff_rc, dcoeff_rcf; - - DIM curr_dims[3]; - - int unprocessed_idx = 0; - curr_dims[0] = 0; - curr_dims[1] = 1; - curr_dims[2] = 2; - dinput1.project(curr_dims[0], curr_dims[1], curr_dims[2]); - doutput.project(curr_dims[0], curr_dims[1], curr_dims[2]); - - calc_coeff_pointers(hierarchy, curr_dims, l, doutput, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - - // gpuErrchk(cudaDeviceSynchronize()); - GpkReo().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - // unprocessed_dims_subarray, - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], - curr_dims[1], curr_dims[0], - SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), - // ratio_r, ratio_c, ratio_f, - dinput1, dcoarse, dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, - dcoeff_rc, dcoeff_rcf, queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - - for (DIM d = 3; d < D; d += 2) { - // copy back to input1 for interpolation again - LwpkReo().Execute(doutput, dinput1, queue_idx); - - // printf("interpolate %u-%uD\n", d+1, d+2); - curr_dims[0] = 0; - curr_dims[1] = d; - curr_dims[2] = d + 1; - dinput1.project(curr_dims[0], curr_dims[1], curr_dims[2]); - doutput.project(curr_dims[0], curr_dims[1], curr_dims[2]); - calc_coeff_pointers(hierarchy, curr_dims, l, doutput, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - - if (D - d == 1) { - unprocessed_idx += 1; - - GpkReo().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], - curr_dims[1], curr_dims[0], - SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput1, dcoarse, - dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf, queue_idx); - - } else { // D - d >= 2 - unprocessed_idx += 2; - GpkReo().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), - // unprocessed_dims_subarray, - curr_dims[2], curr_dims[1], curr_dims[0], - // ratio_r, ratio_c, ratio_f, - SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput1, dcoarse, - dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf, queue_idx); - } - } - - if (debug_print) { // debug - PrintSubarray4D("after 
interpolation", doutput); - } // debug - - unprocessed_idx = 0; - // printf("reorder 1-3D\n"); - curr_dims[0] = 0; - curr_dims[1] = 1; - curr_dims[2] = 2; - dinput2.project(curr_dims[0], curr_dims[1], curr_dims[2]); - dinput1.project(curr_dims[0], curr_dims[1], - curr_dims[2]); // reuse input1 as temp output - - calc_coeff_pointers(hierarchy, curr_dims, l, dinput1, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - - GpkReo().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), - // unprocessed_dims_subarray, - curr_dims[2], curr_dims[1], curr_dims[0], - // ratio_r, ratio_c, ratio_f, - SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput2, dcoarse, - dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, dcoeff_rcf, - queue_idx); - - DIM D_reduced = D % 2 == 0 ? D - 1 : D - 2; - for (DIM d = 3; d < D_reduced; d += 2) { - // copy back to input2 for reordering again - - LwpkReo().Execute(dinput1, dinput2, queue_idx); - - unprocessed_idx += 2; - // printf("reorder %u-%uD\n", d+1, d+2); - curr_dims[0] = 0; - curr_dims[1] = d; - curr_dims[2] = d + 1; - dinput2.project(curr_dims[0], curr_dims[1], curr_dims[2]); - dinput1.project(curr_dims[0], curr_dims[1], - curr_dims[2]); // reuse input1 as temp output - - calc_coeff_pointers(hierarchy, curr_dims, l, dinput1, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - - GpkReo().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), - // unprocessed_dims_subarray, - curr_dims[2], curr_dims[1], curr_dims[0], - // ratio_r, ratio_c, ratio_f, - SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput2, dcoarse, - dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf, queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - } - - // printf("calc coeff %u-%dD\n", D_reduced+1, D_reduced+2); - curr_dims[0] = 0; - curr_dims[1] = D_reduced; - curr_dims[2] = D_reduced + 1; - dinput1.project(curr_dims[0], curr_dims[1], curr_dims[2]); - doutput.project(curr_dims[0], curr_dims[1], - curr_dims[2]); // reuse input1 as temp output - calc_coeff_pointers(hierarchy, curr_dims, l, doutput, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - if (D - D_reduced == 1) { - unprocessed_idx += 1; - GpkReo().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), - // unprocessed_dims_subarray, - curr_dims[2], curr_dims[1], curr_dims[0], - // ratio_r, ratio_c, ratio_f, - SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput1, dcoarse, - dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf, queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - - } else { // D-D_reduced == 2 
- unprocessed_idx += 2; - - GpkReo().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], - curr_dims[1], curr_dims[0], - SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput1, dcoarse, - dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf, queue_idx); - } - - if (debug_print) { // debug - PrintSubarray4D("after calc coeff", doutput); - } // debug -} - -template -void coefficients_restore_nd(Hierarchy &hierarchy, - SubArray dinput1, - SubArray dinput2, - SubArray &doutput, SIZE l, - int queue_idx) { - - int range_l = std::min(6, (int)std::log2(hierarchy.dofs[0][l]) - 1); - int range_lp1 = std::min(6, (int)std::log2(hierarchy.dofs[0][l + 1]) - 1); - - std::string prefix = "decomp_"; - if (sizeof(T) == sizeof(double)) - prefix += "d_"; - if (sizeof(T) == sizeof(float)) - prefix += "f_"; - for (int d = 0; d < D; d++) - prefix += std::to_string(hierarchy.shape[d]) + "_"; - - SubArray dcoarse, dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, - dcoeff_rf, dcoeff_rc, dcoeff_rcf; - - DIM curr_dims[3]; - int unprocessed_idx = 0; - - // printf("interpolate-restore 1-3D\n"); - curr_dims[0] = 0; - curr_dims[1] = 1; - curr_dims[2] = 2; - dinput1.project(curr_dims[0], curr_dims[1], curr_dims[2]); - doutput.project(curr_dims[0], curr_dims[1], curr_dims[2]); - - calc_coeff_pointers(hierarchy, curr_dims, l, dinput1, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - - // gpk_rev( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_d[l], - // hierarchy.shapes_d[l + 1], doutput.getLdd(), dinput1.getLdd(), - // hierarchy.unprocessed_n[unprocessed_idx], - // hierarchy.unprocessed_dims_d[unprocessed_idx], - // curr_dims[2], curr_dims[1], curr_dims[0], - // hierarchy.ratio[curr_dims[2]][l], hierarchy.ratio[curr_dims[1]][l], - // hierarchy.ratio[curr_dims[0]][l], doutput.data(), doutput.getLddv1(), - // doutput.getLddv2(), dcoarse.data(), dcoarse.getLddv1(), - // dcoarse.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_f.data(), dcoeff_f.getLddv1(), dcoeff_f.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_c.data(), dcoeff_c.getLddv1(), dcoeff_c.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_r.data(), dcoeff_r.getLddv1(), dcoeff_r.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_cf.data(), dcoeff_cf.getLddv1(), dcoeff_cf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rf.data(), dcoeff_rf.getLddv1(), dcoeff_rf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rc.data(), dcoeff_rc.getLddv1(), dcoeff_rc.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rcf.data(), dcoeff_rcf.getLddv1(), dcoeff_rcf.getLddv2(), - // // null, lddv1, lddv2, - // 0, 0, 0, - // hierarchy.dofs[curr_dims[2]][l], hierarchy.dofs[curr_dims[1]][l], - // hierarchy.dofs[curr_dims[0]][l], queue_idx, - // hierarchy.auto_tuning_cc[hierarchy.arch][hierarchy.precision][range_l]); - - // gpuErrchk(cudaDeviceSynchronize()); - GpkRev().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), - // unprocessed_dims_subarray, - curr_dims[2], curr_dims[1], curr_dims[0], - // ratio_r, ratio_c, ratio_f, - 
SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), doutput, dcoarse, - dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, dcoeff_rcf, - 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], hierarchy.dofs[curr_dims[1]][l], - hierarchy.dofs[curr_dims[0]][l], queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - - for (DIM d = 3; d < D; d += 2) { - // lwpk(hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_d[l], - // doutput.data(), doutput.getLdd(), - // dinput1.data(), dinput1.getLdd(), queue_idx); - - // gpuErrchk(cudaDeviceSynchronize()); - LwpkReo().Execute(doutput, dinput1, queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - - // printf("interpolate-restore %u-%uD\n", d+1, d+2); - curr_dims[0] = 0; - curr_dims[1] = d; - curr_dims[2] = d + 1; - dinput1.project(curr_dims[0], curr_dims[1], curr_dims[2]); - doutput.project(curr_dims[0], curr_dims[1], curr_dims[2]); - calc_coeff_pointers(hierarchy, curr_dims, l, dinput1, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - - if (D - d == 1) { - unprocessed_idx += 1; - // unprocessed_dims_subarray = SubArray<1, DIM, - // DeviceType>({(SIZE)hierarchy.unprocessed_n[unprocessed_idx]}, - // hierarchy.unprocessed_dims_d[unprocessed_idx]); - // ratio_r = SubArray<1, T, DeviceType>({hierarchy.dofs[curr_dims[2]][l]}, - // hierarchy.ratio[curr_dims[2]][l]); ratio_c = SubArray<1, T, - // DeviceType>({hierarchy.dofs[curr_dims[1]][l]}, - // hierarchy.ratio[curr_dims[1]][l]); ratio_f = SubArray<1, T, - // DeviceType>({hierarchy.dofs[curr_dims[0]][l]}, - // hierarchy.ratio[curr_dims[0]][l]); - - // gpk_rev( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_d[l], - // hierarchy.shapes_d[l + 1], doutput.getLdd(), dinput1.getLdd(), - // hierarchy.unprocessed_n[unprocessed_idx], - // hierarchy.unprocessed_dims_d[unprocessed_idx], - // curr_dims[2], curr_dims[1], curr_dims[0], - // hierarchy.ratio[curr_dims[2]][l], hierarchy.ratio[curr_dims[1]][l], - // hierarchy.ratio[curr_dims[0]][l], doutput.data(), - // doutput.getLddv1(), doutput.getLddv2(), dcoarse.data(), - // dcoarse.getLddv1(), dcoarse.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_f.data(), dcoeff_f.getLddv1(), dcoeff_f.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_c.data(), dcoeff_c.getLddv1(), dcoeff_c.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_r.data(), dcoeff_r.getLddv1(), dcoeff_r.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_cf.data(), dcoeff_cf.getLddv1(), dcoeff_cf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rf.data(), dcoeff_rf.getLddv1(), dcoeff_rf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rc.data(), dcoeff_rc.getLddv1(), dcoeff_rc.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rcf.data(), dcoeff_rcf.getLddv1(), dcoeff_rcf.getLddv2(), - // // null, lddv1, lddv2, - // 0, 0, 0, - // hierarchy.dofs[curr_dims[2]][l], hierarchy.dofs[curr_dims[1]][l], - // hierarchy.dofs[curr_dims[0]][l], queue_idx, - // hierarchy.auto_tuning_cc[hierarchy.arch][hierarchy.precision][range_l]); - - // gpuErrchk(cudaDeviceSynchronize()); - GpkRev().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), - // unprocessed_dims_subarray, - curr_dims[2], curr_dims[1], curr_dims[0], - // ratio_r, ratio_c, ratio_f, - 
SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), doutput, dcoarse, - dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf, 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], - hierarchy.dofs[curr_dims[1]][l], hierarchy.dofs[curr_dims[0]][l], - queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - - } else { // D - d >= 2 - unprocessed_idx += 2; - // unprocessed_dims_subarray = SubArray<1, DIM, - // DeviceType>({(SIZE)hierarchy.unprocessed_n[unprocessed_idx]}, - // hierarchy.unprocessed_dims_d[unprocessed_idx]); - // ratio_r = SubArray<1, T, DeviceType>({hierarchy.dofs[curr_dims[2]][l]}, - // hierarchy.ratio[curr_dims[2]][l]); ratio_c = SubArray<1, T, - // DeviceType>({hierarchy.dofs[curr_dims[1]][l]}, - // hierarchy.ratio[curr_dims[1]][l]); ratio_f = SubArray<1, T, - // DeviceType>({hierarchy.dofs[curr_dims[0]][l]}, - // hierarchy.ratio[curr_dims[0]][l]); - - // gpk_rev( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_d[l], - // hierarchy.shapes_d[l + 1], doutput.getLdd(), dinput1.getLdd(), - // hierarchy.unprocessed_n[unprocessed_idx], - // hierarchy.unprocessed_dims_d[unprocessed_idx], - // curr_dims[2], curr_dims[1], curr_dims[0], - // hierarchy.ratio[curr_dims[2]][l], hierarchy.ratio[curr_dims[1]][l], - // hierarchy.ratio[curr_dims[0]][l], doutput.data(), - // doutput.getLddv1(), doutput.getLddv2(), dcoarse.data(), - // dcoarse.getLddv1(), dcoarse.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_f.data(), dcoeff_f.getLddv1(), dcoeff_f.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_c.data(), dcoeff_c.getLddv1(), dcoeff_c.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_r.data(), dcoeff_r.getLddv1(), dcoeff_r.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_cf.data(), dcoeff_cf.getLddv1(), dcoeff_cf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rf.data(), dcoeff_rf.getLddv1(), dcoeff_rf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rc.data(), dcoeff_rc.getLddv1(), dcoeff_rc.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rcf.data(), dcoeff_rcf.getLddv1(), dcoeff_rcf.getLddv2(), - // // null, lddv1, lddv2, - // 0, 0, 0, - // hierarchy.dofs[curr_dims[2]][l], hierarchy.dofs[curr_dims[1]][l], - // hierarchy.dofs[curr_dims[0]][l], queue_idx, - // hierarchy.auto_tuning_cc[hierarchy.arch][hierarchy.precision][range_l]); - - // gpuErrchk(cudaDeviceSynchronize()); - GpkRev().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), - // unprocessed_dims_subarray, - curr_dims[2], curr_dims[1], curr_dims[0], - // ratio_r, ratio_c, ratio_f, - SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), doutput, dcoarse, - dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf, 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], - hierarchy.dofs[curr_dims[1]][l], hierarchy.dofs[curr_dims[0]][l], - queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - } - } - // Done interpolation-restore on doutput - - if (debug_print) { // debug - PrintSubarray4D("After interpolation reverse-reorder", doutput); - } // debug - - unprocessed_idx = 0; - - // printf("reorder-restore 1-3D\n"); - curr_dims[0] = 0; - curr_dims[1] = 1; - curr_dims[2] = 2; - dinput2.project(curr_dims[0], 
curr_dims[1], curr_dims[2]); - dinput1.project(curr_dims[0], curr_dims[1], - curr_dims[2]); // reuse input1 as temp space - - // unprocessed_dims_subarray = SubArray<1, DIM, - // DeviceType>({(SIZE)hierarchy.unprocessed_n[unprocessed_idx]}, - // hierarchy.unprocessed_dims_d[unprocessed_idx]); - // ratio_r = SubArray<1, T, DeviceType>({hierarchy.dofs[curr_dims[2]][l]}, - // hierarchy.ratio[curr_dims[2]][l]); ratio_c = SubArray<1, T, - // DeviceType>({hierarchy.dofs[curr_dims[1]][l]}, - // hierarchy.ratio[curr_dims[1]][l]); ratio_f = SubArray<1, T, - // DeviceType>({hierarchy.dofs[curr_dims[0]][l]}, - // hierarchy.ratio[curr_dims[0]][l]); - - calc_coeff_pointers(hierarchy, curr_dims, l, dinput2, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - - // gpk_rev( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_d[l], - // hierarchy.shapes_d[l + 1], dinput1.getLdd(), dinput2.getLdd(), - // hierarchy.unprocessed_n[unprocessed_idx], - // hierarchy.unprocessed_dims_d[unprocessed_idx], - // curr_dims[2], curr_dims[1], curr_dims[0], - // hierarchy.ratio[curr_dims[2]][l], hierarchy.ratio[curr_dims[1]][l], - // hierarchy.ratio[curr_dims[0]][l], dinput1.data(), dinput1.getLddv1(), - // dinput1.getLddv2(), dcoarse.data(), dcoarse.getLddv1(), - // dcoarse.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_f.data(), dcoeff_f.getLddv1(), dcoeff_f.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_c.data(), dcoeff_c.getLddv1(), dcoeff_c.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_r.data(), dcoeff_r.getLddv1(), dcoeff_r.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_cf.data(), dcoeff_cf.getLddv1(), dcoeff_cf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rf.data(), dcoeff_rf.getLddv1(), dcoeff_rf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rc.data(), dcoeff_rc.getLddv1(), dcoeff_rc.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rcf.data(), dcoeff_rcf.getLddv1(), dcoeff_rcf.getLddv2(), - // // null, lddv1, lddv2, - // 0, 0, 0, - // hierarchy.dofs[curr_dims[2]][l], hierarchy.dofs[curr_dims[1]][l], - // hierarchy.dofs[curr_dims[0]][l], queue_idx, - // hierarchy.auto_tuning_cc[hierarchy.arch][hierarchy.precision][range_l]); - - // gpuErrchk(cudaDeviceSynchronize()); - GpkRev().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), - // unprocessed_dims_subarray, - curr_dims[2], curr_dims[1], curr_dims[0], - // ratio_r, ratio_c, ratio_f, - SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput1, dcoarse, - dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, dcoeff_rcf, - 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], hierarchy.dofs[curr_dims[1]][l], - hierarchy.dofs[curr_dims[0]][l], queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - - DIM D_reduced = D % 2 == 0 ? 
D - 1 : D - 2; - for (DIM d = 3; d < D_reduced; d += 2) { - // printf("reorder-reverse\n"); - // copy back to input2 for reordering again - // lwpk(hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_d[l], - // dinput1.data(), dinput1.getLdd(), dinput2.data(), - // dinput2.getLdd(), queue_idx); - - // gpuErrchk(cudaDeviceSynchronize()); - LwpkReo().Execute(dinput1, dinput2, queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - - // printf("reorder-restore %u-%uD\n", d+1, d+2); - curr_dims[0] = 0; - curr_dims[1] = d; - curr_dims[2] = d + 1; - dinput2.project(curr_dims[0], curr_dims[1], curr_dims[2]); - dinput1.project(curr_dims[0], curr_dims[1], - curr_dims[2]); // reuse input1 as temp output - - calc_coeff_pointers(hierarchy, curr_dims, l, dinput2, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - - unprocessed_idx += 2; - - // unprocessed_dims_subarray = SubArray<1, DIM, - // DeviceType>({(SIZE)hierarchy.unprocessed_n[unprocessed_idx]}, - // hierarchy.unprocessed_dims_d[unprocessed_idx]); - // ratio_r = SubArray<1, T, DeviceType>({hierarchy.dofs[curr_dims[2]][l]}, - // hierarchy.ratio[curr_dims[2]][l]); ratio_c = SubArray<1, T, - // DeviceType>({hierarchy.dofs[curr_dims[1]][l]}, - // hierarchy.ratio[curr_dims[1]][l]); ratio_f = SubArray<1, T, - // DeviceType>({hierarchy.dofs[curr_dims[0]][l]}, - // hierarchy.ratio[curr_dims[0]][l]); - - // gpk_rev( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_d[l], - // hierarchy.shapes_d[l + 1], dinput1.getLdd(), dinput2.getLdd(), - // hierarchy.unprocessed_n[unprocessed_idx], - // hierarchy.unprocessed_dims_d[unprocessed_idx], - // curr_dims[2], curr_dims[1], curr_dims[0], - // hierarchy.ratio[curr_dims[2]][l], hierarchy.ratio[curr_dims[1]][l], - // hierarchy.ratio[curr_dims[0]][l], dinput1.data(), dinput1.getLddv1(), - // dinput1.getLddv2(), dcoarse.data(), dcoarse.getLddv1(), - // dcoarse.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_f.data(), dcoeff_f.getLddv1(), dcoeff_f.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_c.data(), dcoeff_c.getLddv1(), dcoeff_c.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_r.data(), dcoeff_r.getLddv1(), dcoeff_r.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_cf.data(), dcoeff_cf.getLddv1(), dcoeff_cf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rf.data(), dcoeff_rf.getLddv1(), dcoeff_rf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rc.data(), dcoeff_rc.getLddv1(), dcoeff_rc.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rcf.data(), dcoeff_rcf.getLddv1(), dcoeff_rcf.getLddv2(), - // // null, lddv1, lddv2, - // 0, 0, 0, - // hierarchy.dofs[curr_dims[2]][l], hierarchy.dofs[curr_dims[1]][l], - // hierarchy.dofs[curr_dims[0]][l], queue_idx, - // hierarchy.auto_tuning_cc[hierarchy.arch][hierarchy.precision][range_l]); - - // gpuErrchk(cudaDeviceSynchronize()); - GpkRev().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), - // unprocessed_dims_subarray, - curr_dims[2], curr_dims[1], curr_dims[0], - // ratio_r, ratio_c, ratio_f, - SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput1, dcoarse, - dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf, 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], - hierarchy.dofs[curr_dims[1]][l], 
hierarchy.dofs[curr_dims[0]][l], - queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - } - - // printf("coeff-restore %u-%dD\n", D_reduced+1, D_reduced+2); - curr_dims[0] = 0; - curr_dims[1] = D_reduced; - curr_dims[2] = D_reduced + 1; - dinput1.project(curr_dims[0], curr_dims[1], curr_dims[2]); - doutput.project(curr_dims[0], curr_dims[1], curr_dims[2]); - calc_coeff_pointers(hierarchy, curr_dims, l, dinput1, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - - if (D - D_reduced == 1) { - // printf("coeff-restore %u-%dD\n", D_reduced+1, D_reduced+1); - unprocessed_idx += 1; - - // unprocessed_dims_subarray = SubArray<1, DIM, - // DeviceType>({(SIZE)hierarchy.unprocessed_n[unprocessed_idx]}, - // hierarchy.unprocessed_dims_d[unprocessed_idx]); - // ratio_r = SubArray<1, T, DeviceType>({hierarchy.dofs[curr_dims[2]][l]}, - // hierarchy.ratio[curr_dims[2]][l]); ratio_c = SubArray<1, T, - // DeviceType>({hierarchy.dofs[curr_dims[1]][l]}, - // hierarchy.ratio[curr_dims[1]][l]); ratio_f = SubArray<1, T, - // DeviceType>({hierarchy.dofs[curr_dims[0]][l]}, - // hierarchy.ratio[curr_dims[0]][l]); - - // gpk_rev( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_d[l], - // hierarchy.shapes_d[l + 1], doutput.getLdd(), dinput1.getLdd(), - // hierarchy.unprocessed_n[unprocessed_idx], - // hierarchy.unprocessed_dims_d[unprocessed_idx], - // curr_dims[2], curr_dims[1], curr_dims[0], - // hierarchy.ratio[curr_dims[2]][l], hierarchy.ratio[curr_dims[1]][l], - // hierarchy.ratio[curr_dims[0]][l], doutput.data(), doutput.getLddv1(), - // doutput.getLddv2(), dcoarse.data(), dcoarse.getLddv1(), - // dcoarse.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_f.data(), dcoeff_f.getLddv1(), dcoeff_f.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_c.data(), dcoeff_c.getLddv1(), dcoeff_c.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_r.data(), dcoeff_r.getLddv1(), dcoeff_r.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_cf.data(), dcoeff_cf.getLddv1(), dcoeff_cf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rf.data(), dcoeff_rf.getLddv1(), dcoeff_rf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rc.data(), dcoeff_rc.getLddv1(), dcoeff_rc.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rcf.data(), dcoeff_rcf.getLddv1(), dcoeff_rcf.getLddv2(), - // // null, lddv1, lddv2, - // 0, 0, 0, - // hierarchy.dofs[curr_dims[2]][l], hierarchy.dofs[curr_dims[1]][l], - // hierarchy.dofs[curr_dims[0]][l], queue_idx, - // hierarchy.auto_tuning_cc[hierarchy.arch][hierarchy.precision][range_l]); - - // gpuErrchk(cudaDeviceSynchronize()); - GpkRev().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), - // unprocessed_dims_subarray, - curr_dims[2], curr_dims[1], curr_dims[0], - // ratio_r, ratio_c, ratio_f, - SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), doutput, dcoarse, - dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf, 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], - hierarchy.dofs[curr_dims[1]][l], hierarchy.dofs[curr_dims[0]][l], - queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - } else { // D - D_reduced >= 2 - // printf("coeff-restore %u-%dD\n", D_reduced+1, D_reduced+2); - unprocessed_idx += 2; - - // unprocessed_dims_subarray = 
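Both the forward coefficient pass and this reverse pass sweep the higher dimensions two at a time: dimensions {0, 1, 2} first, then {0, d, d + 1} for d = 3, 5, and so on, with D_reduced trimming the loop so the tail pass covers the one or two dimensions left over (the D - D_reduced == 1 branch further down). The following standalone snippet only replays that index arithmetic for a few values of D; it is an illustration, not part of the patch:

#include <cstdio>

// Illustration only: the two-dimensions-per-pass schedule used by the
// multidimensional coefficient kernels, with D_reduced = D - 1 for even D
// and D - 2 for odd D, exactly as computed in the code above.
int main() {
  for (unsigned D = 4; D <= 7; ++D) {
    unsigned D_reduced = (D % 2 == 0) ? D - 1 : D - 2;
    std::printf("D=%u: {0,1,2}", D);
    for (unsigned d = 3; d < D_reduced; d += 2)
      std::printf(" {0,%u,%u}", d, d + 1);
    std::printf(" tail {0,%u,%u} (%u leftover)\n", D_reduced, D_reduced + 1,
                D - D_reduced);
  }
}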
SubArray<1, DIM, - // DeviceType>({(SIZE)hierarchy.unprocessed_n[unprocessed_idx]}, - // hierarchy.unprocessed_dims_d[unprocessed_idx]); - // ratio_r = SubArray<1, T, DeviceType>({hierarchy.dofs[curr_dims[2]][l]}, - // hierarchy.ratio[curr_dims[2]][l]); ratio_c = SubArray<1, T, - // DeviceType>({hierarchy.dofs[curr_dims[1]][l]}, - // hierarchy.ratio[curr_dims[1]][l]); ratio_f = SubArray<1, T, - // DeviceType>({hierarchy.dofs[curr_dims[0]][l]}, - // hierarchy.ratio[curr_dims[0]][l]); - - // gpk_rev( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_d[l], - // hierarchy.shapes_d[l + 1], doutput.getLdd(), dinput1.getLdd(), - // hierarchy.unprocessed_n[unprocessed_idx], - // hierarchy.unprocessed_dims_d[unprocessed_idx], - // curr_dims[2], curr_dims[1], curr_dims[0], - // hierarchy.ratio[curr_dims[2]][l], hierarchy.ratio[curr_dims[1]][l], - // hierarchy.ratio[curr_dims[0]][l], doutput.data(), doutput.getLddv1(), - // doutput.getLddv2(), dcoarse.data(), dcoarse.getLddv1(), - // dcoarse.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_f.data(), dcoeff_f.getLddv1(), dcoeff_f.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_c.data(), dcoeff_c.getLddv1(), dcoeff_c.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_r.data(), dcoeff_r.getLddv1(), dcoeff_r.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_cf.data(), dcoeff_cf.getLddv1(), dcoeff_cf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rf.data(), dcoeff_rf.getLddv1(), dcoeff_rf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rc.data(), dcoeff_rc.getLddv1(), dcoeff_rc.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rcf.data(), dcoeff_rcf.getLddv1(), dcoeff_rcf.getLddv2(), - // // null, lddv1, lddv2, - // 0, 0, 0, - // hierarchy.dofs[curr_dims[2]][l], hierarchy.dofs[curr_dims[1]][l], - // hierarchy.dofs[curr_dims[0]][l], queue_idx, - // hierarchy.auto_tuning_cc[hierarchy.arch][hierarchy.precision][range_l]); - - // gpuErrchk(cudaDeviceSynchronize()); - GpkRev().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), - // unprocessed_dims_subarray, - curr_dims[2], curr_dims[1], curr_dims[0], - // ratio_r, ratio_c, ratio_f, - SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), doutput, dcoarse, - dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf, 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], - hierarchy.dofs[curr_dims[1]][l], hierarchy.dofs[curr_dims[0]][l], - queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - } - - if (debug_print) { // debug - PrintSubarray4D("After coeff restore", doutput); - } // debug -} - -template -void calc_correction_nd(Hierarchy &hierarchy, - SubArray dcoeff, - SubArray &dcorrection, SIZE l, - int queue_idx) { - int range_l = std::min(6, (int)std::log2(hierarchy.dofs[0][l]) - 1); - int range_lp1 = std::min(6, (int)std::log2(hierarchy.dofs[0][l + 1]) - 1); - - std::string prefix = "decomp_"; - if (sizeof(T) == sizeof(double)) - prefix += "d_"; - if (sizeof(T) == sizeof(float)) - prefix += "f_"; - for (int d = 0; d < D; d++) - prefix += std::to_string(hierarchy.shape[d]) + "_"; - - SubArray dw_in1 = dcoeff; - SubArray dw_in2 = dcoeff; - SubArray dw_out = dcorrection; - - // start correction calculation - int prev_dim_r, prev_dim_c, prev_dim_f; - int curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 
2; - - dw_in1.resize(curr_dim_f, hierarchy.dofs[curr_dim_f][l + 1]); - dw_in2.offset(curr_dim_f, hierarchy.dofs[curr_dim_f][l + 1]); - dw_in2.resize(curr_dim_f, hierarchy.dofs[curr_dim_f][l] - - hierarchy.dofs[curr_dim_f][l + 1]); - dw_out.resize(curr_dim_f, hierarchy.dofs[curr_dim_f][l + 1]); - - dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); - - // printf("mass trans 1D\n"); - // lpk_reo_1( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_h[l + 1], - // hierarchy.shapes_d[l], hierarchy.shapes_d[l + 1], dw_in1.getLdd(), - // dw_out.getLdd(), hierarchy.processed_n[0], - // hierarchy.processed_dims_h[0], hierarchy.processed_dims_d[0], - // curr_dim_r, curr_dim_c, curr_dim_f, hierarchy.dist[curr_dim_f][l], - // hierarchy.ratio[curr_dim_f][l], dw_in1.data(), dw_in1.getLddv1(), - // dw_in1.getLddv2(), dw_in2.data(), dw_in2.getLddv1(), dw_in2.getLddv2(), - // dw_out.data(), dw_out.getLddv1(), dw_out.getLddv2(), queue_idx, - // hierarchy.auto_tuning_mr1[hierarchy.arch][hierarchy.precision][range_lp1]); - - // SubArray<1, T, DeviceType> dist_f = SubArray<1, T, - // DeviceType>({hierarchy.dofs[curr_dim_f][l]}, - // hierarchy.dist[curr_dim_f][l]); SubArray<1, T, DeviceType> ratio_f = - // SubArray<1, T, DeviceType>({hierarchy.dofs[curr_dim_f][l]}, - // hierarchy.ratio[curr_dim_f][l]); gpuErrchk(cudaDeviceSynchronize()); - Lpk1Reo().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.processed_n[0], - SubArray<1, SIZE, DeviceType>(hierarchy.processed_dims[0], true), - curr_dim_r, curr_dim_c, curr_dim_f, - // dist_f, ratio_f, - SubArray(hierarchy.dist_array[curr_dim_f][l]), - SubArray(hierarchy.ratio_array[curr_dim_f][l]), dw_in1, dw_in2, dw_out, - 0); - // gpuErrchk(cudaDeviceSynchronize()); - - if (debug_print) { // debug - PrintSubarray4D(format("decomposition: after MR-1D[{}]", l), dw_out); - } - - // mass trans 2D - prev_dim_f = curr_dim_f; - prev_dim_c = curr_dim_c; - prev_dim_r = curr_dim_r; - curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; - - dw_in1 = dw_out; - dw_in2 = dw_out; - dw_in1.resize(curr_dim_c, hierarchy.dofs[curr_dim_c][l + 1]); - dw_in2.offset(curr_dim_c, hierarchy.dofs[curr_dim_c][l + 1]); - dw_in2.resize(curr_dim_c, hierarchy.dofs[curr_dim_c][l] - - hierarchy.dofs[curr_dim_c][l + 1]); - dw_out.offset(prev_dim_f, hierarchy.dofs[curr_dim_f][l + 1]); - dw_out.resize(curr_dim_c, hierarchy.dofs[curr_dim_c][l + 1]); - - dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); - - // printf("mass trans 2D\n"); - // lpk_reo_2( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_h[l + 1], - // hierarchy.shapes_d[l], hierarchy.shapes_d[l + 1], - // dw_in1.getLdd(), dw_out.getLdd(), - // hierarchy.processed_n[1], hierarchy.processed_dims_h[1], - // hierarchy.processed_dims_d[1], - // curr_dim_r, curr_dim_c, curr_dim_f, - // hierarchy.dist[curr_dim_c][l], hierarchy.ratio[curr_dim_c][l], - // dw_in1.data(), dw_in1.getLddv1(), dw_in1.getLddv2(), - // dw_in2.data(), dw_in2.getLddv1(), dw_in2.getLddv2(), - // dw_out.data(), dw_out.getLddv1(), dw_out.getLddv2(), queue_idx, - // hierarchy.auto_tuning_mr1[hierarchy.arch][hierarchy.precision][range_lp1]); - - // gpuErrchk(cudaDeviceSynchronize()); - Lpk2Reo().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - 
SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.processed_n[1], - SubArray<1, SIZE, DeviceType>(hierarchy.processed_dims[1], true), - curr_dim_r, curr_dim_c, curr_dim_f, - // dist_f, ratio_f, - SubArray(hierarchy.dist_array[curr_dim_c][l]), - SubArray(hierarchy.ratio_array[curr_dim_c][l]), dw_in1, dw_in2, dw_out, - 0); - // gpuErrchk(cudaDeviceSynchronize()); - - if (debug_print) { // debug - PrintSubarray4D(format("decomposition: after MR-2D[{}]", l), dw_out); - } - - // mass trans 3D - - prev_dim_f = curr_dim_f; - prev_dim_c = curr_dim_c; - prev_dim_r = curr_dim_r; - curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; - - dw_in1 = dw_out; - dw_in2 = dw_out; - dw_in1.resize(curr_dim_r, hierarchy.dofs[curr_dim_r][l + 1]); - dw_in2.offset(curr_dim_r, hierarchy.dofs[curr_dim_r][l + 1]); - dw_in2.resize(curr_dim_r, hierarchy.dofs[curr_dim_r][l] - - hierarchy.dofs[curr_dim_r][l + 1]); - dw_out.offset(prev_dim_c, hierarchy.dofs[curr_dim_c][l + 1]); - dw_out.resize(curr_dim_r, hierarchy.dofs[curr_dim_r][l + 1]); - - dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); - - // printf("mass trans 3D\n"); - // lpk_reo_3( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_h[l + 1], - // hierarchy.shapes_d[l], hierarchy.shapes_d[l + 1], - // dw_in1.getLdd(), dw_out.getLdd(), - // hierarchy.processed_n[2], hierarchy.processed_dims_h[2], - // hierarchy.processed_dims_d[2], - // curr_dim_r, curr_dim_c, curr_dim_f, - // hierarchy.dist[curr_dim_r][l], hierarchy.ratio[curr_dim_r][l], - // dw_in1.data(), dw_in1.getLddv1(), dw_in1.getLddv2(), - // dw_in2.data(), dw_in2.getLddv1(), dw_in2.getLddv2(), - // dw_out.data(), dw_out.getLddv1(), dw_out.getLddv2(), queue_idx, - // hierarchy.auto_tuning_mr1[hierarchy.arch][hierarchy.precision][range_lp1]); - - // gpuErrchk(cudaDeviceSynchronize()); - Lpk3Reo().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.processed_n[2], - SubArray<1, SIZE, DeviceType>(hierarchy.processed_dims[2], true), - curr_dim_r, curr_dim_c, curr_dim_f, - // dist_f, ratio_f, - SubArray(hierarchy.dist_array[curr_dim_r][l]), - SubArray(hierarchy.ratio_array[curr_dim_r][l]), dw_in1, dw_in2, dw_out, - 0); - // gpuErrchk(cudaDeviceSynchronize()); - - if (debug_print) { // debug - PrintSubarray4D(format("decomposition: after MR-3D[{}]", l), dw_out); - } - - // mass trans 4D+ - for (int i = 3; i < D; i++) { - prev_dim_f = curr_dim_f; - prev_dim_c = curr_dim_c; - prev_dim_r = curr_dim_r; - curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = i; - dw_in1 = dw_out; - dw_in2 = dw_out; - dw_in1.resize(curr_dim_r, hierarchy.dofs[curr_dim_r][l + 1]); - dw_in2.offset(curr_dim_r, hierarchy.dofs[curr_dim_r][l + 1]); - dw_in2.resize(curr_dim_r, hierarchy.dofs[curr_dim_r][l] - - hierarchy.dofs[curr_dim_r][l + 1]); - dw_out.offset(prev_dim_r, hierarchy.dofs[prev_dim_r][l + 1]); - dw_out.resize(curr_dim_r, hierarchy.dofs[curr_dim_r][l + 1]); - - dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); - - // printf("mass trans %dD\n", i+1); - // lpk_reo_3( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_h[l + 1], - // hierarchy.shapes_d[l], hierarchy.shapes_d[l + 1], - // dw_in1.getLdd(), dw_out.getLdd(), - // hierarchy.processed_n[i], hierarchy.processed_dims_h[i], - // 
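The offset/resize pairs in this function encode a single invariant: along the current dimension, dw_in1 aliases the first dofs[d][l+1] entries of the working buffer (the coarse nodes), dw_in2 aliases the remaining dofs[d][l] - dofs[d][l+1] entries (the coefficients), and dw_out is shrunk to the coarse extent. A minimal standalone sketch of that split, assuming the usual 2^k + 1 nodes per level (illustration only, not part of the patch):

#include <cstdio>

// Illustration only: how an axis of length n_l = dofs[d][l] is aliased into
// a coarse range [0, n_lp1) (dw_in1) and a coefficient range [n_lp1, n_l)
// (dw_in2) of the same buffer.
int main() {
  for (unsigned k = 1; k <= 4; ++k) {
    unsigned n_l = (1u << k) + 1;   // 2^k + 1 nodes at level l (assumption)
    unsigned n_lp1 = (n_l + 1) / 2; // coarse nodes at level l + 1
    std::printf("n_l=%2u: coarse [0,%u), coeff [%u,%u)\n", n_l, n_lp1, n_lp1,
                n_l);
  }
}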
hierarchy.processed_dims_d[i], - // curr_dim_r, curr_dim_c, curr_dim_f, - // hierarchy.dist[curr_dim_r][l], hierarchy.ratio[curr_dim_r][l], - // dw_in1.data(), dw_in1.getLddv1(), dw_in1.getLddv2(), - // dw_in2.data(), dw_in2.getLddv1(), dw_in2.getLddv2(), - // dw_out.data(), dw_out.getLddv1(), dw_out.getLddv2(), queue_idx, - // hierarchy.auto_tuning_mr1[hierarchy.arch][hierarchy.precision][range_lp1]); - - // gpuErrchk(cudaDeviceSynchronize()); - Lpk3Reo().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.processed_n[i], - SubArray<1, SIZE, DeviceType>(hierarchy.processed_dims[i], true), - curr_dim_r, curr_dim_c, curr_dim_f, - // dist_f, ratio_f, - SubArray(hierarchy.dist_array[curr_dim_r][l]), - SubArray(hierarchy.ratio_array[curr_dim_r][l]), dw_in1, dw_in2, dw_out, - 0); - // gpuErrchk(cudaDeviceSynchronize()); - - if (debug_print) { // debug - PrintSubarray4D(format("decomposition: after MR-{}D[{}]", i + 1, l), - dw_out); - } - } - - curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; - dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); - - // printf("solve tridiag 1D\n"); - // ipk_1( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_h[l + 1], - // hierarchy.shapes_d[l], hierarchy.shapes_d[l + 1], - // dw_out.getLdd(), dw_out.getLdd(), - // hierarchy.processed_n[0], hierarchy.processed_dims_h[0], - // hierarchy.processed_dims_d[0], - // curr_dim_r, curr_dim_c, curr_dim_f, - // hierarchy.am[curr_dim_f][l + 1], hierarchy.bm[curr_dim_f][l + 1], - // hierarchy.dist[curr_dim_f][l + 1], - // dw_out.data(), dw_out.getLddv1(), dw_out.getLddv2(), queue_idx, - // hierarchy.auto_tuning_ts1[hierarchy.arch][hierarchy.precision][range_lp1]); - - // gpuErrchk(cudaDeviceSynchronize()); - Ipk1Reo().Execute( - curr_dim_r, curr_dim_c, curr_dim_f, - SubArray(hierarchy.am_array[curr_dim_f][l + 1]), - SubArray(hierarchy.bm_array[curr_dim_f][l + 1]), dw_out, 0); - // gpuErrchk(cudaDeviceSynchronize()); - - if (debug_print) { // debug - PrintSubarray4D(format("decomposition: after TR-1D[{}]", l), dw_out); - } // debug - - curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; - dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); - - // printf("solve tridiag 2D\n"); - // ipk_2( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_h[l + 1], - // hierarchy.shapes_d[l], hierarchy.shapes_d[l + 1], - // dw_out.getLdd(), dw_out.getLdd(), - // hierarchy.processed_n[1], hierarchy.processed_dims_h[1], - // hierarchy.processed_dims_d[1], - // curr_dim_r, curr_dim_c, curr_dim_f, - // hierarchy.am[curr_dim_c][l + 1], hierarchy.bm[curr_dim_c][l + 1], - // hierarchy.dist[curr_dim_c][l + 1], - // dw_out.data(), dw_out.getLddv1(), dw_out.getLddv2(), queue_idx, - // hierarchy.auto_tuning_ts1[hierarchy.arch][hierarchy.precision][range_lp1]); - - // gpuErrchk(cudaDeviceSynchronize()); - Ipk2Reo().Execute( - curr_dim_r, curr_dim_c, curr_dim_f, - SubArray(hierarchy.am_array[curr_dim_c][l + 1]), - SubArray(hierarchy.bm_array[curr_dim_c][l + 1]), - // SubArray(hierarchy.dist_array[curr_dim_f][l+1]), - dw_out, 0); - // gpuErrchk(cudaDeviceSynchronize()); - - if (debug_print) { // debug - PrintSubarray4D(format("decomposition: after TR-2D[{}]", l), dw_out); - } // debug - - curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; - 
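The "solve tridiag" stages that follow invert a mass matrix along one axis at a time, with Ipk1Reo/Ipk2Reo/Ipk3Reo consuming the am/bm factors stored per dimension and level. A serial Thomas-algorithm solve shows the kind of system involved (illustration only; that am/bm hold precomputed elimination factors is an assumption based on how they are indexed by [dim][l + 1] here):

#include <cstddef>
#include <cstdio>
#include <vector>

// Illustration only: a serial tridiagonal solve of the kind the Ipk kernels
// perform along one axis. a/b/c are the sub-, main and super-diagonals; d is
// the right-hand side and receives the solution.
void thomas_solve(std::vector<double> a, std::vector<double> b,
                  std::vector<double> c, std::vector<double> &d) {
  const std::size_t n = d.size();
  for (std::size_t i = 1; i < n; ++i) { // forward elimination
    const double m = a[i] / b[i - 1];
    b[i] -= m * c[i - 1];
    d[i] -= m * d[i - 1];
  }
  d[n - 1] /= b[n - 1]; // back substitution
  for (std::size_t i = n - 1; i-- > 0;)
    d[i] = (d[i] - c[i] * d[i + 1]) / b[i];
}

int main() {
  // A small diagonally dominant system with a right-hand side of ones.
  std::vector<double> a(5, 1.0), b(5, 4.0), c(5, 1.0), d(5, 1.0);
  thomas_solve(a, b, c, d);
  for (const double x : d)
    std::printf("%g ", x);
  std::printf("\n");
}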
dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); - - // printf("solve tridiag 3D\n"); - // ipk_3( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_h[l + 1], - // hierarchy.shapes_d[l], hierarchy.shapes_d[l + 1], - // dw_out.getLdd(), dw_out.getLdd(), - // hierarchy.processed_n[2], hierarchy.processed_dims_h[2], - // hierarchy.processed_dims_d[2], curr_dim_r, curr_dim_c, curr_dim_f, - // hierarchy.am[curr_dim_r][l + 1], hierarchy.bm[curr_dim_r][l + 1], - // hierarchy.dist[curr_dim_r][l + 1], - // dw_out.data(), dw_out.getLddv1(), dw_out.getLddv2(), queue_idx, - // hierarchy.auto_tuning_ts1[hierarchy.arch][hierarchy.precision][range_lp1]); - - // gpuErrchk(cudaDeviceSynchronize()); - Ipk3Reo().Execute( - curr_dim_r, curr_dim_c, curr_dim_f, - SubArray(hierarchy.am_array[curr_dim_r][l + 1]), - SubArray(hierarchy.bm_array[curr_dim_r][l + 1]), - // SubArray(hierarchy.dist_array[curr_dim_f][l+1]), - dw_out, 0); - // gpuErrchk(cudaDeviceSynchronize()); - - if (debug_print) { // debug - PrintSubarray4D(format("decomposition: after TR-3D[{}]", l), dw_out); - } // debug - - // mass trans 4D+ - for (int i = 3; i < D; i++) { - curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = i; - dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); - // printf("solve tridiag %dD\n", i+1); - // ipk_3( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_h[l + 1], - // hierarchy.shapes_d[l], hierarchy.shapes_d[l + 1], - // dw_out.getLdd(), dw_out.getLdd(), - // hierarchy.processed_n[i], hierarchy.processed_dims_h[i], - // hierarchy.processed_dims_d[i], curr_dim_r, curr_dim_c, curr_dim_f, - // hierarchy.am[curr_dim_r][l + 1], hierarchy.bm[curr_dim_r][l + 1], - // hierarchy.dist[curr_dim_r][l + 1], - // dw_out.data(), dw_out.getLddv1(), dw_out.getLddv2(), queue_idx, - // hierarchy.auto_tuning_ts1[hierarchy.arch][hierarchy.precision][range_lp1]); - - // gpuErrchk(cudaDeviceSynchronize()); - Ipk3Reo().Execute( - curr_dim_r, curr_dim_c, curr_dim_f, - SubArray(hierarchy.am_array[curr_dim_r][l + 1]), - SubArray(hierarchy.bm_array[curr_dim_r][l + 1]), - // SubArray(hierarchy.dist_array[curr_dim_f][l+1]), - dw_out, 0); - // gpuErrchk(cudaDeviceSynchronize()); - if (debug_print) { // debug - PrintSubarray4D(format("decomposition: after TR-{}D[{}]", i + 1, l), - dw_out); - } // debug - } - - dcorrection = dw_out; -} - template void decompose(Hierarchy &hierarchy, SubArray &v, SIZE l_target, int queue_idx) { @@ -1710,85 +41,71 @@ void decompose(Hierarchy &hierarchy, SubArray w_correction = w; SubArray v_coarse = v; - if (D <= 3) { + if constexpr (D <= 3) { for (int l = 0; l < l_target; ++l) { - if (debug_print) { + if (multidim_refactoring_debug_print) { PrintSubarray("input v", v); } - // DeviceRuntime::SyncDevice(); v_fine.resize(hierarchy.shapes2[l]); w_fine.resize(hierarchy.shapes2[l]); - LwpkReo().Execute(v_fine, w_fine, queue_idx); - // DeviceRuntime::SyncDevice(); + CopyND(v_fine, w_fine, queue_idx); + v_coeff.resize(hierarchy.shapes2[l]); - calc_coefficients_3d(hierarchy, w_fine, v_coeff, l, queue_idx); - // DeviceRuntime::SyncDevice(); + CalcCoefficients3D(hierarchy, w_fine, v_coeff, l, queue_idx); + w_correction.resize(hierarchy.shapes2[l]); - calc_correction_3d(hierarchy, v_coeff, w_correction, l, queue_idx); - // DeviceRuntime::SyncDevice(); + CalcCorrection3D(hierarchy, v_coeff, w_correction, l, 
queue_idx); w_correction.resize(hierarchy.shapes2[l + 1]); v_coarse.resize(hierarchy.shapes2[l + 1]); - LwpkReo().Execute(w_correction, v_coarse, - queue_idx); - // DeviceRuntime::SyncDevice(); - if (debug_print) { + AddND(w_correction, v_coarse, queue_idx); + if (multidim_refactoring_debug_print) { PrintSubarray("after add", v); } } // end of loop - if (debug_print) { + if (multidim_refactoring_debug_print) { PrintSubarray("output of decomposition", v); } } - if (D > 3) { + if constexpr (D > 3) { Array workspace2(workspace_shape); SubArray b(workspace2); SubArray b_fine = b; for (int l = 0; l < l_target; ++l) { - if (debug_print) { // debug + if (multidim_refactoring_debug_print) { // debug PrintSubarray4D("before coeff", v); } - // std::vector shape(hierarchy.D_padded); - // for (DIM d = 0; d < hierarchy.D_padded; d++) shape[d] = - // hierarchy.shapes_h[l][d]; - - // gpuErrchk(cudaDeviceSynchronize()); - v_fine.resize(hierarchy.shapes2[l]); w_fine.resize(hierarchy.shapes2[l]); - LwpkReo().Execute(v_fine, w_fine, queue_idx); + CopyND(v_fine, w_fine, queue_idx); v_fine.resize(hierarchy.shapes2[l]); b_fine.resize(hierarchy.shapes2[l]); - LwpkReo().Execute(v_fine, b_fine, queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); + CopyND(v_fine, b_fine, queue_idx); + v_coeff.resize(hierarchy.shapes2[l]); - calc_coefficients_nd(hierarchy, w_fine, b_fine, v_coeff, l, queue_idx); + CalcCoefficientsND(hierarchy, w_fine, b_fine, v_coeff, l, queue_idx); - if (debug_print) { // debug + if (multidim_refactoring_debug_print) { // debug PrintSubarray4D(format("after coeff[%d]", l), v_coeff); } // debug - // gpuErrchk(cudaDeviceSynchronize()); w_correction.resize(hierarchy.shapes2[l]); - calc_correction_nd(hierarchy, v_coeff, w_correction, l, 0); - // gpuErrchk(cudaDeviceSynchronize()); + CalcCorrectionND(hierarchy, v_coeff, w_correction, l, queue_idx); w_correction.resize(hierarchy.shapes2[l + 1]); v_coarse.resize(hierarchy.shapes2[l + 1]); - LwpkReo().Execute(w_correction, v_coarse, - queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - - if (debug_print) { // debug + AddND(w_correction, v_coarse, queue_idx); + if (multidim_refactoring_debug_print) { // debug PrintSubarray4D(format("after apply correction[%d]", l), v); } // debug } } + DeviceRuntime::SyncDevice(); } template @@ -1809,8 +126,8 @@ void recompose(Hierarchy &hierarchy, SubArray w_correction = w; SubArray v_coarse = v; - if (D <= 3) { - if (debug_print) { + if constexpr (D <= 3) { + if (multidim_refactoring_debug_print) { PrintSubarray("input of recomposition", v); } std::string prefix = "recomp_"; @@ -1823,36 +140,32 @@ void recompose(Hierarchy &hierarchy, // std::cout << prefix << std::endl; for (int l = l_target - 1; l >= 0; l--) { - v_coeff.resize(hierarchy.shapes2[l]); w_correction.resize(hierarchy.shapes2[l]); - calc_correction_3d(hierarchy, v_coeff, w_correction, l, 0); + CalcCorrection3D(hierarchy, v_coeff, w_correction, l, queue_idx); w_correction.resize(hierarchy.shapes2[l + 1]); v_coarse.resize(hierarchy.shapes2[l + 1]); - LwpkReo().Execute(w_correction, v_coarse, - queue_idx); + SubtractND(w_correction, v_coarse, queue_idx); v_coeff.resize(hierarchy.shapes2[l]); w_fine.resize(hierarchy.shapes2[l]); - coefficients_restore_3d(hierarchy, v_coeff, w_fine, l, 0); + CoefficientsRestore3D(hierarchy, v_coeff, w_fine, l, queue_idx); v_fine.resize(hierarchy.shapes2[l]); - LwpkReo().Execute(w_fine, v_fine, queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - - if (debug_print) { + CopyND(w_fine, v_fine, queue_idx); + if 
(multidim_refactoring_debug_print) { PrintSubarray("output of recomposition", v); } } } - if (D > 3) { + if constexpr (D > 3) { Array workspace2(workspace_shape); SubArray b(workspace2); SubArray b_fine = b; for (int l = l_target - 1; l >= 0; l--) { - if (debug_print) { // debug + if (multidim_refactoring_debug_print) { // debug PrintSubarray4D(format("before correction[%d]", l), v); } @@ -1860,50 +173,39 @@ void recompose(Hierarchy &hierarchy, int lddv1, lddv2; int lddw1, lddw2; int lddb1, lddb2; - // un-apply correction - // std::vector shape(hierarchy.D_padded); - // for (DIM d = 0; d < hierarchy.D_padded; d++) shape[d] = - // hierarchy.shapes_h[l][d]; - if (debug_print) { // debug + if (multidim_refactoring_debug_print) { // debug PrintSubarray4D(format("before subtract correction[%d]", l), v); } // debug - // gpuErrchk(cudaDeviceSynchronize()); v_coeff.resize(hierarchy.shapes2[l]); w_correction.resize(hierarchy.shapes2[l]); - calc_correction_nd(hierarchy, v_coeff, w_correction, l, 0); + CalcCorrectionND(hierarchy, v_coeff, w_correction, l, queue_idx); w_correction.resize(hierarchy.shapes2[l + 1]); v_coarse.resize(hierarchy.shapes2[l + 1]); - // gpuErrchk(cudaDeviceSynchronize()); - LwpkReo().Execute(w_correction, v_coarse, - queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); + SubtractND(w_correction, v_coarse, queue_idx); - if (debug_print) { // debug + if (multidim_refactoring_debug_print) { // debug PrintSubarray4D(format("after subtract correction[%d]", l), v); } // debug v_coeff.resize(hierarchy.shapes2[l]); w_fine.resize(hierarchy.shapes2[l]); b_fine.resize(hierarchy.shapes2[l]); - // gpuErrchk(cudaDeviceSynchronize()); - LwpkReo().Execute(v_coeff, b_fine, queue_idx); - LwpkReo().Execute(v_coeff, w_fine, queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); + CopyND(v_coeff, b_fine, queue_idx); + CopyND(v_coeff, w_fine, queue_idx); v_fine.resize(hierarchy.shapes2[l]); - coefficients_restore_nd(hierarchy, w_fine, b_fine, v_fine, l, queue_idx); + CoefficientsRestoreND(hierarchy, w_fine, b_fine, v_fine, l, queue_idx); } // loop levels - if (debug_print) { // debug + if (multidim_refactoring_debug_print) { // debug std::vector shape(hierarchy.D_padded); - // for (DIM d = 0; d < hierarchy.D_padded; d++) shape[d] = - // hierarchy.shapes_h[0][d]; PrintSubarray4D(format("final output"), v); } // debug } // D > 3 + DeviceRuntime::SyncDevice(); } } // namespace mgard_x \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/SingleDimension/Coefficient/CalcCoefficients.hpp b/include/mgard-x/DataRefactoring/SingleDimension/Coefficient/CalcCoefficients.hpp new file mode 100644 index 0000000000..87a64199b2 --- /dev/null +++ b/include/mgard-x/DataRefactoring/SingleDimension/Coefficient/CalcCoefficients.hpp @@ -0,0 +1,32 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. 
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "../../../Hierarchy/Hierarchy.hpp" +#include "../../../RuntimeX/RuntimeX.h" + +#include "../DataRefactoring.h" + +#include "CoefficientKernel.hpp" + +#ifndef MGARD_X_DATA_REFACTORING_CALC_COEFFICIENTS +#define MGARD_X_DATA_REFACTORING_CALC_COEFFICIENTS + +namespace mgard_x { + +template +void CalcCoefficients(DIM current_dim, SubArray<1, T, DeviceType> ratio, + SubArray v, + SubArray coarse, + SubArray coeff, int queue_idx) { + + SingleDimensionCoefficient().Execute( + current_dim, ratio, v, coarse, coeff, queue_idx); +} + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/SingleDimension/Coefficient/CoefficientKernel.hpp b/include/mgard-x/DataRefactoring/SingleDimension/Coefficient/CoefficientKernel.hpp index c2e5355e06..16245fdf3a 100644 --- a/include/mgard-x/DataRefactoring/SingleDimension/Coefficient/CoefficientKernel.hpp +++ b/include/mgard-x/DataRefactoring/SingleDimension/Coefficient/CoefficientKernel.hpp @@ -195,13 +195,12 @@ class SingleDimensionCoefficient : public AutoTuner { SubArray v, SubArray coarse, SubArray coeff, int queue_idx) { int range_l = std::min(6, (int)std::log2(coeff.getShape(0)) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); int config = AutoTuner::autoTuningTable.gpk_reo_nd[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define GPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -214,22 +213,27 @@ class SingleDimensionCoefficient : public AutoTuner { TaskType task = \ GenTask(current_dim, ratio, v, coarse, coeff, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - GPK(0) - GPK(1) - GPK(2) - GPK(3) - GPK(4) - GPK(5) - GPK(6) + GPK(6) if (!ret.success) config--; + GPK(5) if (!ret.success) config--; + GPK(4) if (!ret.success) config--; + GPK(3) if (!ret.success) config--; + GPK(2) if (!ret.success) config--; + GPK(1) if (!ret.success) config--; + GPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err + << "no suitable config for SingleDimensionCoefficient.\n"; + exit(-1); + } #undef GPK if (AutoTuner::ProfileKernels) { diff --git a/include/mgard-x/DataRefactoring/SingleDimension/Coefficient/CoefficientsRestore.hpp b/include/mgard-x/DataRefactoring/SingleDimension/Coefficient/CoefficientsRestore.hpp new file mode 100644 index 0000000000..c2c6c02517 --- /dev/null +++ b/include/mgard-x/DataRefactoring/SingleDimension/Coefficient/CoefficientsRestore.hpp @@ -0,0 +1,32 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. 
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "../../../Hierarchy/Hierarchy.hpp" +#include "../../../RuntimeX/RuntimeX.h" + +#include "../DataRefactoring.h" + +#include "CoefficientKernel.hpp" + +#ifndef MGARD_X_DATA_REFACTORING_COEFFICIENTS_RESTORE +#define MGARD_X_DATA_REFACTORING_COEFFICIENTS_RESTORE + +namespace mgard_x { + +template +void CoefficientsRestore(DIM current_dim, SubArray<1, T, DeviceType> ratio, + SubArray v, + SubArray coarse, + SubArray coeff, int queue_idx) { + + SingleDimensionCoefficient().Execute( + current_dim, ratio, v, coarse, coeff, queue_idx); +} + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/SingleDimension/Correction/CalcCorrection.hpp b/include/mgard-x/DataRefactoring/SingleDimension/Correction/CalcCorrection.hpp new file mode 100644 index 0000000000..ea95a4f0b9 --- /dev/null +++ b/include/mgard-x/DataRefactoring/SingleDimension/Correction/CalcCorrection.hpp @@ -0,0 +1,84 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "../../../Hierarchy/Hierarchy.hpp" +#include "../../../RuntimeX/RuntimeX.h" + +#include "../DataRefactoring.h" + +#include "../../MultiDimension/Correction/IterativeProcessingKernel.hpp" +#include "../../MultiDimension/Correction/LevelwiseProcessingKernel.hpp" + +#include "MassTransKernel.hpp" + +#ifndef MGARD_X_DATA_REFACTORING_CALC_CORRECTION +#define MGARD_X_DATA_REFACTORING_CALC_CORRECTION + +namespace mgard_x { + +template +void CalcCorrection(Hierarchy &hierarchy, + SubArray &coeff, + SubArray &correction, SIZE curr_dim, + SIZE l, int queue_idx) { + + SingleDimensionMassTrans().Execute( + curr_dim, SubArray(hierarchy.dist_array[curr_dim][l]), + SubArray(hierarchy.ratio_array[curr_dim][l]), coeff, correction, + queue_idx); + + if (singledim_refactoring_debug_print) { + PrintSubarray("SingleDimensionMassTrans", correction); + } + + DIM curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; + if (curr_dim == 0) { + correction.project(curr_dim_f, curr_dim_c, curr_dim_r); + Ipk1Reo().Execute( + curr_dim_r, curr_dim_c, curr_dim_f, + SubArray(hierarchy.am_array[curr_dim_f][l + 1]), + SubArray(hierarchy.bm_array[curr_dim_f][l + 1]), correction, queue_idx); + if (singledim_refactoring_debug_print) { + PrintSubarray("Ipk1Reo", correction); + } + + } else if (curr_dim == 1) { + curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; + correction.project(curr_dim_f, curr_dim_c, curr_dim_r); + Ipk2Reo().Execute( + curr_dim_r, curr_dim_c, curr_dim_f, + SubArray(hierarchy.am_array[curr_dim_c][l + 1]), + SubArray(hierarchy.bm_array[curr_dim_c][l + 1]), correction, queue_idx); + if (singledim_refactoring_debug_print) { + PrintSubarray("Ipk2Reo", correction); + } + } else if (curr_dim == 2) { + curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; + correction.project(curr_dim_f, curr_dim_c, curr_dim_r); + Ipk3Reo().Execute( + curr_dim_r, curr_dim_c, curr_dim_f, + SubArray(hierarchy.am_array[curr_dim_r][l + 1]), + SubArray(hierarchy.bm_array[curr_dim_r][l + 1]), correction, queue_idx); + if (singledim_refactoring_debug_print) { + PrintSubarray("Ipk3Reo", correction); + } + } else { + curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = curr_dim; + correction.project(curr_dim_f, curr_dim_c, curr_dim_r); + Ipk3Reo().Execute( + 
curr_dim_r, curr_dim_c, curr_dim_f, + SubArray(hierarchy.am_array[curr_dim_r][l + 1]), + SubArray(hierarchy.bm_array[curr_dim_r][l + 1]), correction, queue_idx); + if (singledim_refactoring_debug_print) { + PrintSubarray("Ipk3Reo", correction); + } + } +} + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/SingleDimension/Correction/MassTransKernel.hpp b/include/mgard-x/DataRefactoring/SingleDimension/Correction/MassTransKernel.hpp index cb434ca5a3..079dceae1d 100644 --- a/include/mgard-x/DataRefactoring/SingleDimension/Correction/MassTransKernel.hpp +++ b/include/mgard-x/DataRefactoring/SingleDimension/Correction/MassTransKernel.hpp @@ -167,13 +167,12 @@ class SingleDimensionMassTrans : public AutoTuner { SubArray coeff, SubArray v, int queue_idx) { int range_l = std::min(6, (int)std::log2(v.getShape(0)) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); int config = AutoTuner::autoTuningTable.gpk_reo_nd[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define GPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -186,22 +185,27 @@ class SingleDimensionMassTrans : public AutoTuner { TaskType task = \ GenTask(current_dim, dist, ratio, coeff, v, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - GPK(0) - GPK(1) - GPK(2) - GPK(3) - GPK(4) - GPK(5) - GPK(6) + GPK(6) if (!ret.success) config--; + GPK(5) if (!ret.success) config--; + GPK(4) if (!ret.success) config--; + GPK(3) if (!ret.success) config--; + GPK(2) if (!ret.success) config--; + GPK(1) if (!ret.success) config--; + GPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err + << "no suitable config for SingleDimensionMassTrans.\n"; + exit(-1); + } #undef GPK if (AutoTuner::ProfileKernels) { diff --git a/include/mgard-x/DataRefactoring/SingleDimension/Correction/SolveTridiagKernel.hpp b/include/mgard-x/DataRefactoring/SingleDimension/Correction/SolveTridiagKernel.hpp deleted file mode 100644 index 7ece383ad7..0000000000 --- a/include/mgard-x/DataRefactoring/SingleDimension/Correction/SolveTridiagKernel.hpp +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#ifndef MGARD_X_SINGLE_DIMENSION_SOLVE_TRIDIAG_KERNEL_TEMPLATE -#define MGARD_X_SINGLE_DIMENSION_SOLVE_TRIDIAG_KERNEL_TEMPLATE - -#include "../../../RuntimeX/RuntimeX.h" - -// #include "../../MultiDimension/Correction/LPKFunctor.h" - -namespace mgard_x { - -template -class ForwardPassMultCoefficientFunctor : public Functor { -public: - MGARDX_CONT ForwardPassMultCoefficientFunctor() {} - MGARDX_CONT - ForwardPassMultCoefficientFunctor(SubArray<1, T, DeviceType> am, - SubArray<1, T, DeviceType> bm, - SubArray<1, T, DeviceType> amXbm) - : am(am), bm(bm), (amXbm) { - Functor(); - } - - MGARDX_EXEC void Operation1() { - SIZE id = FunctorBase::GetBlockIdX() * - FunctorBase::GetBlockDimX() + - FunctorBase::GetThreadIdX(); - - if (id < am.getShape(0)) { - *amXbm(id) = (*am(id)) * (*bm(id)); - } - } - - MGARDX_CONT size_t shared_memory_size() { return 0; } - -private: - // functor parameters - SubArray<1, T, DeviceType> am; - SubArray<1, T, DeviceType> bm; - SubArray<1, T, DeviceType> amXbm; -}; - -template -class SingleDimensionSolveTridiag : public AutoTuner { -public: - MGARDX_CONT - SingleDimensionSolveTridiag() : AutoTuner() {} - - template - MGARDX_CONT Task> - GenTask(SubArray<1, T, DeviceType> am, SubArray<1, T, DeviceType> bm, - SubArray<1, T, DeviceType> amXbm, int queue_idx) { - - using FunctorType = ForwardPassMultCoefficientFunctor; - FunctorType functor(am, bm, amXbm); - - SIZE nf = v.getShape(0); - SIZE total_thread_x = nf; - - SIZE tbx, tby, tbz, gridx, gridy, gridz; - size_t sm_size = functor.shared_memory_size(); - - tbz = 1; - tby = 1; - tbx = F; - gridz = 1; - gridy = 1; - gridx = ceil((float)total_thread_x / tbx); - - return Task(functor, gridz, gridy, gridx, tbz, tby, tbx, sm_size, queue_idx, - "ForwardPassMultCoefficient"); - } - - MGARDX_CONT - void Execute(DIM current_dim, SubArray<1, T, DeviceType> dist, - SubArray<1, T, DeviceType> ratio, - SubArray coeff, SubArray v, - SubArray<1, T, DeviceType> am, SubArray<1, T, DeviceType> bm, - int queue_idx) { - - Array<1, T, DeviceType> amXbm(am.getShape(0)); - - int range_l = std::min(6, (int)std::log2(coeff.getShape(0)) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); - int prec = TypeToIdx(); - int config = - AutoTuner::autoTuningTable.gpk_reo_nd[prec][range_l]; - - double min_time = std::numeric_limits::max(); - int min_config = 0; - -#define GPK(CONFIG) \ - if (config == CONFIG || AutoTuner::ProfileKernels) { \ - const int F = GPK_CONFIG[D - 1][CONFIG][2]; \ - using FunctorType = ForwardPassMultCoefficient; \ - using TaskType = Task; \ - TaskType task = GenTask(am, bm, SubArray(amXbm), queue_idx); \ - DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ - if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ - min_time = ret.execution_time; \ - min_config = CONFIG; \ - } \ - } \ - } - - GPK(0) - GPK(1) - GPK(2) - GPK(3) - GPK(4) - GPK(5) - GPK(6) -#undef GPK - - if (AutoTuner::ProfileKernels) { - FillAutoTunerTable("SingleDimensionSolveTridiag", prec, - range_l, min_config); - } - } -}; - -} // namespace mgard_x - -#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/SingleDimension/DataRefactoring.h b/include/mgard-x/DataRefactoring/SingleDimension/DataRefactoring.h index 79178e71bc..f68b0b9a02 100644 --- a/include/mgard-x/DataRefactoring/SingleDimension/DataRefactoring.h +++ 
b/include/mgard-x/DataRefactoring/SingleDimension/DataRefactoring.h @@ -8,11 +8,45 @@ #ifndef MGARD_X_SINGLE_DIMENSION_DATA_REFACTORING #define MGARD_X_SINGLE_DIMENSION_DATA_REFACTORING -#include "../../Hierarchy.h" +#include "../../Hierarchy/Hierarchy.h" #include "../../RuntimeX/RuntimeXPublic.h" namespace mgard_x { +static bool singledim_refactoring_store = false; +static bool singledim_refactoring_verify = false; +static bool singledim_refactoring_debug_print = false; + +template +void CalcCoefficients(DIM current_dim, SubArray<1, T, DeviceType> ratio, + SubArray v, + SubArray coarse, + SubArray coeff, int queue_idx); + +template +void CoefficientsRestore(DIM current_dim, SubArray<1, T, DeviceType> ratio, + SubArray v, + SubArray coarse, + SubArray coeff, int queue_idx); + +template +void CalcCorrection(Hierarchy &hierarchy, + SubArray &coeff, + SubArray &correction, SIZE curr_dim, + SIZE l, int queue_idx); + +template +void CopyND(SubArray dinput, + SubArray &doutput, int queue_idx); + +template +void AddND(SubArray dinput, + SubArray &doutput, int queue_idx); + +template +void SubtractND(SubArray dinput, + SubArray &doutput, int queue_idx); + template void decompose_single(Hierarchy &hierarchy, SubArray &v, SIZE l_target, diff --git a/include/mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp b/include/mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp index c86e515178..86ade65620 100644 --- a/include/mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp +++ b/include/mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp @@ -5,75 +5,13 @@ * Date: March 17, 2022 */ -#include "../../Hierarchy.hpp" +#include "../../Hierarchy/Hierarchy.hpp" #include "../../RuntimeX/RuntimeX.h" -#include "../MultiDimension/Correction/IterativeProcessingKernel.hpp" -#include "../MultiDimension/Correction/LevelwiseProcessingKernel.hpp" -#include "Coefficient/CoefficientKernel.hpp" -#include "Correction/MassTransKernel.hpp" +#include "DataRefactoring.h" namespace mgard_x { -template -void calc_correction_single(Hierarchy &hierarchy, - SubArray &coeff, - SubArray &correction, - SIZE curr_dim, SIZE l, int queue_idx) { - - SingleDimensionMassTrans().Execute( - curr_dim, SubArray(hierarchy.dist_array[curr_dim][l]), - SubArray(hierarchy.ratio_array[curr_dim][l]), coeff, correction, - queue_idx); - - if (debug_print) { - PrintSubarray("SingleDimensionMassTrans", correction); - } - - DIM curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; - if (curr_dim == 0) { - correction.project(curr_dim_f, curr_dim_c, curr_dim_r); - Ipk1Reo().Execute( - curr_dim_r, curr_dim_c, curr_dim_f, - SubArray(hierarchy.am_array[curr_dim_f][l + 1]), - SubArray(hierarchy.bm_array[curr_dim_f][l + 1]), correction, queue_idx); - if (debug_print) { - PrintSubarray("Ipk1Reo", correction); - } - - } else if (curr_dim == 1) { - curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; - correction.project(curr_dim_f, curr_dim_c, curr_dim_r); - Ipk2Reo().Execute( - curr_dim_r, curr_dim_c, curr_dim_f, - SubArray(hierarchy.am_array[curr_dim_c][l + 1]), - SubArray(hierarchy.bm_array[curr_dim_c][l + 1]), correction, queue_idx); - if (debug_print) { - PrintSubarray("Ipk2Reo", correction); - } - } else if (curr_dim == 2) { - curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; - correction.project(curr_dim_f, curr_dim_c, curr_dim_r); - Ipk3Reo().Execute( - curr_dim_r, curr_dim_c, curr_dim_f, - SubArray(hierarchy.am_array[curr_dim_r][l + 1]), - SubArray(hierarchy.bm_array[curr_dim_r][l + 1]), correction, queue_idx); - if (debug_print) { - 
PrintSubarray("Ipk3Reo", correction); - } - } else { - curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = curr_dim; - correction.project(curr_dim_f, curr_dim_c, curr_dim_r); - Ipk3Reo().Execute( - curr_dim_r, curr_dim_c, curr_dim_f, - SubArray(hierarchy.am_array[curr_dim_r][l + 1]), - SubArray(hierarchy.bm_array[curr_dim_r][l + 1]), correction, queue_idx); - if (debug_print) { - PrintSubarray("Ipk3Reo", correction); - } - } -} - template void decompose_single(Hierarchy &hierarchy, SubArray &v, SIZE l_target, @@ -85,13 +23,13 @@ void decompose_single(Hierarchy &hierarchy, Array workspace(workspace_shape); SubArray w(workspace); - if (debug_print) { + if (singledim_refactoring_debug_print) { PrintSubarray("Input", v); } for (int l = 0; l < l_target; ++l) { for (int curr_dim = 0; curr_dim < D; curr_dim++) { - if (debug_print) { + if (singledim_refactoring_debug_print) { std::cout << "l: " << l << " curr_dim: " << curr_dim << "\n"; } std::vector fine_shape(D); @@ -133,28 +71,26 @@ void decompose_single(Hierarchy &hierarchy, SubArray correction = w; correction.resize(coarse_shape); - LwpkReo().Execute(v_fine, w_fine, queue_idx); + CopyND(v_fine, w_fine, queue_idx); - if (debug_print) { + if (singledim_refactoring_debug_print) { PrintSubarray("COPY", w_fine); } - SingleDimensionCoefficient().Execute( - curr_dim, SubArray(hierarchy.ratio_array[curr_dim][l]), w_fine, - coarse, coeff, queue_idx); + CalcCoefficients(curr_dim, SubArray(hierarchy.ratio_array[curr_dim][l]), + w_fine, coarse, coeff, queue_idx); - if (debug_print) { + if (singledim_refactoring_debug_print) { PrintSubarray("SingleDimensionCoefficient - fine", w_fine); PrintSubarray("SingleDimensionCoefficient - corase", coarse); PrintSubarray("SingleDimensionCoefficient - coeff", coeff); } - calc_correction_single(hierarchy, coeff, correction, curr_dim, l, - queue_idx); + CalcCorrection(hierarchy, coeff, correction, curr_dim, l, queue_idx); - LwpkReo().Execute(correction, coarse, queue_idx); + AddND(correction, coarse, queue_idx); - if (debug_print) { + if (singledim_refactoring_debug_print) { PrintSubarray("ADD", coarse); } @@ -173,13 +109,13 @@ void recompose_single(Hierarchy &hierarchy, Array workspace(workspace_shape); SubArray w(workspace); - if (debug_print) { + if (singledim_refactoring_debug_print) { PrintSubarray("Input", v); } for (int l = l_target - 1; l >= 0; --l) { for (int curr_dim = D - 1; curr_dim >= 0; curr_dim--) { - if (debug_print) { + if (singledim_refactoring_debug_print) { std::cout << "l: " << l << " curr_dim: " << curr_dim << "\n"; } std::vector fine_shape(D); @@ -220,29 +156,27 @@ void recompose_single(Hierarchy &hierarchy, SubArray correction = w; correction.resize(coarse_shape); - calc_correction_single(hierarchy, coeff, correction, curr_dim, l, - queue_idx); + CalcCorrection(hierarchy, coeff, correction, curr_dim, l, queue_idx); - LwpkReo().Execute(correction, coarse, - queue_idx); + SubtractND(correction, coarse, queue_idx); - if (debug_print) { + if (singledim_refactoring_debug_print) { PrintSubarray("SUBTRACT", coarse); } - SingleDimensionCoefficient().Execute( - curr_dim, SubArray(hierarchy.ratio_array[curr_dim][l]), w_fine, - coarse, coeff, queue_idx); + CoefficientsRestore(curr_dim, + SubArray(hierarchy.ratio_array[curr_dim][l]), w_fine, + coarse, coeff, queue_idx); - if (debug_print) { + if (singledim_refactoring_debug_print) { PrintSubarray("SingleDimensionCoefficient - fine", w_fine); PrintSubarray("SingleDimensionCoefficient - corase", coarse); PrintSubarray("SingleDimensionCoefficient - coeff", coeff); } 
- LwpkReo().Execute(w_fine, v_fine, queue_idx); + CopyND(w_fine, v_fine, queue_idx); - if (debug_print) { + if (singledim_refactoring_debug_print) { PrintSubarray("COPY", v_fine); } diff --git a/include/mgard-x/Hierarchy/CMakeLists.txt b/include/mgard-x/Hierarchy/CMakeLists.txt new file mode 100644 index 0000000000..d02867c67b --- /dev/null +++ b/include/mgard-x/Hierarchy/CMakeLists.txt @@ -0,0 +1,5 @@ +list(APPEND MGARD_X_HEADER + ${CMAKE_CURRENT_SOURCE_DIR}/Hierarchy.h + ${CMAKE_CURRENT_SOURCE_DIR}/Hierarchy.hpp + ) +set(MGARD_X_HEADER ${MGARD_X_HEADER} PARENT_SCOPE) \ No newline at end of file diff --git a/include/mgard-x/Hierarchy.h b/include/mgard-x/Hierarchy/Hierarchy.h similarity index 97% rename from include/mgard-x/Hierarchy.h rename to include/mgard-x/Hierarchy/Hierarchy.h index 2553d38f0e..d4a3af8071 100644 --- a/include/mgard-x/Hierarchy.h +++ b/include/mgard-x/Hierarchy/Hierarchy.h @@ -8,8 +8,8 @@ #ifndef MGARD_X_HANDLE #define MGARD_X_HANDLE -#include "RuntimeX/RuntimeXPublic.h" -#include "Types.h" +#include "../RuntimeX/RuntimeXPublic.h" +#include "../Utilities/Types.h" namespace mgard_x { @@ -32,7 +32,7 @@ struct Config { int reorder; Config() { - dev_type = device_type::Auto; + dev_type = device_type::AUTO; dev_id = 0; decomposition = decomposition_type::MultiDim; l_target = -1; // no limit diff --git a/include/mgard-x/Hierarchy.hpp b/include/mgard-x/Hierarchy/Hierarchy.hpp similarity index 99% rename from include/mgard-x/Hierarchy.hpp rename to include/mgard-x/Hierarchy/Hierarchy.hpp index db28f7cb64..0e5635b4de 100644 --- a/include/mgard-x/Hierarchy.hpp +++ b/include/mgard-x/Hierarchy/Hierarchy.hpp @@ -5,8 +5,8 @@ * Date: March 17, 2022 */ +#include "../RuntimeX/RuntimeX.h" #include "Hierarchy.h" -#include "RuntimeX/RuntimeX.h" #include #include diff --git a/include/mgard-x/Linearization/CMakeLists.txt b/include/mgard-x/Linearization/CMakeLists.txt new file mode 100644 index 0000000000..0e6faba3c8 --- /dev/null +++ b/include/mgard-x/Linearization/CMakeLists.txt @@ -0,0 +1,5 @@ +list(APPEND MGARD_X_HEADER + ${CMAKE_CURRENT_SOURCE_DIR}/LevelLinearizer.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/LevelLinearizer2.hpp + ) +set(MGARD_X_HEADER ${MGARD_X_HEADER} PARENT_SCOPE) \ No newline at end of file diff --git a/include/mgard-x/Linearization/LevelLinearizer2.hpp b/include/mgard-x/Linearization/LevelLinearizer2.hpp index a8a7498250..7a77e431f4 100644 --- a/include/mgard-x/Linearization/LevelLinearizer2.hpp +++ b/include/mgard-x/Linearization/LevelLinearizer2.hpp @@ -316,8 +316,8 @@ class LevelLinearizer2 : public AutoTuner { for (int d = 3; d < D; d++) { gridx *= shape.dataHost()[d]; } - return Task(functor, gridz, gridy, gridx, tbz, tby, tbx, sm_size, - queue_idx); + return Task(functor, gridz, gridy, gridx, tbz, tby, tbx, sm_size, queue_idx, + "LevelLinearizer"); } MGARDX_CONT @@ -349,10 +349,10 @@ class LevelLinearizer2 : public AutoTuner { int range_l = std::min(6, (int)std::log2(v.getShape(0)) - 1); int prec = TypeToIdx(); - int config = AutoTuner::autoTuningTable.llk[prec][range_l]; double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define LLK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -365,21 +365,25 @@ class LevelLinearizer2 : public AutoTuner { TaskType task = \ GenTask(shape, l_target, ranges, v, d_level_v, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if 
(ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - LLK(0) - LLK(1) - LLK(2) - LLK(3) - LLK(4) - LLK(5) - LLK(6) + LLK(6) if (!ret.success) config--; + LLK(5) if (!ret.success) config--; + LLK(4) if (!ret.success) config--; + LLK(3) if (!ret.success) config--; + LLK(2) if (!ret.success) config--; + LLK(1) if (!ret.success) config--; + LLK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for LevelLinearizer.\n"; + exit(-1); + } #undef LLK if (AutoTuner::ProfileKernels) { FillAutoTunerTable("llk", prec, range_l, min_config); diff --git a/include/mgard-x/Lossless/ParallelHuffman/GenerateCL.hpp b/include/mgard-x/Lossless/ParallelHuffman/GenerateCL.hpp index 48ab3ba211..65bd36c41c 100644 --- a/include/mgard-x/Lossless/ParallelHuffman/GenerateCL.hpp +++ b/include/mgard-x/Lossless/ParallelHuffman/GenerateCL.hpp @@ -90,7 +90,6 @@ class GenerateCLFunctor : public HuffmanCLCustomizedFunctor { // printf("LoopCondition1 %d %u %d\n", (*status((IDX)_lNodesCur)), size, // (*status((IDX)_iNodesSize))); return (*status((IDX)_lNodesCur)) < size || (*status((IDX)_iNodesSize)) > 1; - ; } MGARDX_EXEC void Operation2() { @@ -239,7 +238,8 @@ class GenerateCLFunctor : public HuffmanCLCustomizedFunctor { if (*lNodesFreq((IDX)i) <= (*status((IDX)_minFreq))) { threadCurLeavesNum = i - (*status((IDX)_lNodesCur)) + 1; // Atomic max -- Largest valid index - Atomic::Max(status((IDX)_curLeavesNum), threadCurLeavesNum); + Atomic::Max( + status((IDX)_curLeavesNum), threadCurLeavesNum); } if (i - (*status((IDX)_lNodesCur)) < (*status((IDX)_curLeavesNum))) { @@ -725,16 +725,18 @@ class GenerateCL : public AutoTuner { SubArray<1, int, DeviceType> copyIndex, SubArray<1, uint32_t, DeviceType> diagonal_path_intersections, int queue_idx) { - Array<1, int, DeviceType> status({(SIZE)16}, false, true); + Array<1, int, DeviceType> status_array({(SIZE)16}, false, true); + SubArray status(status_array); using FunctorType = GenerateCLFunctor; using TaskType = Task; - TaskType task = GenTask( - histogram, CL, dict_size, lNodesFreq, lNodesLeader, iNodesFreq, - iNodesLeader, tempFreq, tempIsLeaf, tempIndex, copyFreq, copyIsLeaf, - copyIndex, diagonal_path_intersections, SubArray(status), queue_idx); + TaskType task = GenTask(histogram, CL, dict_size, lNodesFreq, lNodesLeader, + iNodesFreq, iNodesLeader, tempFreq, tempIsLeaf, + tempIndex, copyFreq, copyIsLeaf, copyIndex, + diagonal_path_intersections, status, queue_idx); DeviceAdapter adapter; adapter.Execute(task); + DeviceRuntime::SyncAllQueues(); } }; diff --git a/include/mgard-x/Lossless/ParallelHuffman/GenerateCW.hpp b/include/mgard-x/Lossless/ParallelHuffman/GenerateCW.hpp index 3698eb0eb9..57d2906dca 100644 --- a/include/mgard-x/Lossless/ParallelHuffman/GenerateCW.hpp +++ b/include/mgard-x/Lossless/ParallelHuffman/GenerateCW.hpp @@ -95,7 +95,8 @@ class GenerateCWFunctor : public HuffmanCWCustomizedFunctor { FunctorBase::GetThreadIdX(); // (*status((IDX)_CDPI)) update if (i < size - 1 && *CL((IDX)i + 1) > (*status((IDX)_CCL))) { - Atomic::Min(&(*status((IDX)_newCDPI)), (int)i); + Atomic::Min( + &(*status((IDX)_newCDPI)), (int)i); } } @@ -313,6 +314,7 @@ class GenerateCW : public AutoTuner { GenTask(CL, CW, first, entry, dict_size, SubArray(status), queue_idx); DeviceAdapter adapter; adapter.Execute(task); + DeviceRuntime::SyncAllQueues(); } }; diff --git a/include/mgard-x/Lossless/ParallelHuffman/GetFirstNonzeroIndex.hpp 
b/include/mgard-x/Lossless/ParallelHuffman/GetFirstNonzeroIndex.hpp index e45f6cf215..039e867d2d 100644 --- a/include/mgard-x/Lossless/ParallelHuffman/GetFirstNonzeroIndex.hpp +++ b/include/mgard-x/Lossless/ParallelHuffman/GetFirstNonzeroIndex.hpp @@ -28,7 +28,8 @@ class GetFirstNonzeroIndexFunctor : public Functor { FunctorBase::GetBlockDimX()) + FunctorBase::GetThreadIdX(); if (thread < size && *array(thread) != 0) { - Atomic::Min(result((IDX)0), thread); + Atomic::Min(result((IDX)0), thread); } } diff --git a/include/mgard-x/Lossless/ParallelHuffman/Histogram.hpp b/include/mgard-x/Lossless/ParallelHuffman/Histogram.hpp index 773a422744..7786ebd388 100644 --- a/include/mgard-x/Lossless/ParallelHuffman/Histogram.hpp +++ b/include/mgard-x/Lossless/ParallelHuffman/Histogram.hpp @@ -46,15 +46,24 @@ class HistogramFunctor : public Functor { if (warpid >= warps_block - 1) end = N; - for (unsigned int pos = FunctorBase::GetThreadIdX(); - pos < (bins)*R; pos += FunctorBase::GetBlockDimX()) - Hs[pos] = 0; + if (CACHE_HISTOGRAM) { + for (unsigned int pos = FunctorBase::GetThreadIdX(); + pos < (bins)*R; pos += FunctorBase::GetBlockDimX()) { + Hs[pos] = 0; + } + } } MGARDX_EXEC void Operation2() { for (unsigned int i = begin; i < end; i += step) { int d = *input_data(i); - Atomic::Add(&Hs[off_rep + d], 1); + if (CACHE_HISTOGRAM) { + Atomic::Add( + &Hs[off_rep + d], 1); + } else { + Atomic::Add( + &Hs[off_rep + d], 1); + } } } @@ -65,7 +74,8 @@ class HistogramFunctor : public Functor { for (int base = 0; base < (bins)*R; base += bins) { sum += Hs[base + pos]; } - Atomic::Add(output(pos), (Q)sum); + Atomic::Add( + output(pos), (Q)sum); } } @@ -120,9 +130,10 @@ class Histogram : public AutoTuner { threadsPerBlock = ((((numValues / (numBlocks * itemsPerThread)) + 1) / 64) + 1) * 64; - while (threadsPerBlock > 1024) { + while (threadsPerBlock > + DeviceRuntime::GetMaxNumThreadsPerTB()) { if (RPerBlock <= 1) { - threadsPerBlock = 1024; + threadsPerBlock = DeviceRuntime::GetMaxNumThreadsPerTB(); } else { RPerBlock /= 2; numBlocks *= 2; @@ -218,13 +229,16 @@ class Histogram : public AutoTuner { int threadsPerBlock, numBlocks; Config(len, dict_size, RPerBlock, threadsPerBlock, numBlocks); Array<1, int, DeviceType> local_histogram_array( - {(SIZE)2 * dict_size * numBlocks}); + {(SIZE)RPerBlock * dict_size * numBlocks}, false, true); + local_histogram_array.memset(0); + DeviceRuntime::SyncAllQueues(); local_histogram = SubArray(local_histogram_array); TaskType task = GenTask(input_data, local_histogram, output, len, dict_size, RPerBlock, threadsPerBlock, numBlocks, queue_idx); DeviceAdapter adapter; adapter.Execute(task); + DeviceRuntime::SyncAllQueues(); } } }; diff --git a/include/mgard-x/Quantization/LinearQuantization.hpp b/include/mgard-x/Quantization/LinearQuantization.hpp index 9b66962246..da6ce9d15c 100644 --- a/include/mgard-x/Quantization/LinearQuantization.hpp +++ b/include/mgard-x/Quantization/LinearQuantization.hpp @@ -322,7 +322,8 @@ class LevelwiseLinearQuantizeNDFunctor : public Functor { if (quantized_data >= 0 && quantized_data < dict_size) { // do nothing } else { - LENGTH i = Atomic::Add(outlier_count((IDX)0), (LENGTH)1); + LENGTH i = Atomic::Add(outlier_count((IDX)0), (LENGTH)1); *outlier_idx(i) = get_idx(shape_sm, idx); *outliers(i) = quantized_data; quantized_data = 0; @@ -445,6 +446,7 @@ class LevelwiseLinearQuantizeND : public AutoTuner { int config = AutoTuner::autoTuningTable.lwqzk[prec][range_l]; double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn 
ret; #define LWQZK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -458,21 +460,26 @@ class LevelwiseLinearQuantizeND : public AutoTuner { ranges, l_target, quantizers, volumes, s, huff_dict_size, v, work, \ prep_huffman, shape, outlier_count, outlier_idx, outliers, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - LWQZK(0) - LWQZK(1) - LWQZK(2) - LWQZK(3) - LWQZK(4) - LWQZK(5) - LWQZK(6) + LWQZK(6) if (!ret.success) config--; + LWQZK(5) if (!ret.success) config--; + LWQZK(4) if (!ret.success) config--; + LWQZK(3) if (!ret.success) config--; + LWQZK(2) if (!ret.success) config--; + LWQZK(1) if (!ret.success) config--; + LWQZK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err + << "no suitable config for LevelwiseLinearQuantizeND.\n"; + exit(-1); + } #undef LWQZK if (AutoTuner::ProfileKernels) { FillAutoTunerTable("lwqzk", prec, range_l, min_config); @@ -963,11 +970,11 @@ class LevelwiseLinearDequantizeND : public AutoTuner { int range_l = std::min(6, (int)std::log2(v.getShape(0)) - 1); int prec = TypeToIdx(); - int config = AutoTuner::autoTuningTable.lwdqzk[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; + #define LWDQZK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ const int R = LWPK_CONFIG[D - 1][CONFIG][0]; \ @@ -980,21 +987,27 @@ class LevelwiseLinearDequantizeND : public AutoTuner { ranges, l_target, quantizers, volumes, s, huff_dict_size, v, work, \ prep_huffman, shape, outlier_count, outlier_idx, outliers, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - LWDQZK(0) - LWDQZK(1) - LWDQZK(2) - LWDQZK(3) - LWDQZK(4) - LWDQZK(5) - LWDQZK(6) + + LWDQZK(6) if (!ret.success) config--; + LWDQZK(5) if (!ret.success) config--; + LWDQZK(4) if (!ret.success) config--; + LWDQZK(3) if (!ret.success) config--; + LWDQZK(2) if (!ret.success) config--; + LWDQZK(1) if (!ret.success) config--; + LWDQZK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err + << "no suitable config for LevelwiseLinearDequantizeND.\n"; + exit(-1); + } #undef LWDQZK if (AutoTuner::ProfileKernels) { FillAutoTunerTable("lwdqzk", prec, range_l, min_config); @@ -1002,783 +1015,6 @@ class LevelwiseLinearDequantizeND : public AutoTuner { } }; -// template -// __global__ void -// _levelwise_linear_quantize(SIZE *shapes, SIZE l_target, T *quantizers, T * -// volumes, SIZE ldvolumes, T *dv, -// SIZE *ldvs, QUANTIZED_INT *dwork, SIZE *ldws, bool -// prep_huffman, SIZE dict_size, SIZE *shape, LENGTH -// *outlier_count, LENGTH *outlier_idx, QUANTIZED_INT -// *outliers) { - -// size_t threadId = (threadIdx.z * (blockDim.x * blockDim.y)) + -// (threadIdx.y * blockDim.x) + threadIdx.x; -// T * smT = SharedMemory(); -// T * quantizers_sm = smT; smT += l_target + 1; - -// T * volumes_0 = smT; if (CALC_VOL) smT += blockDim.x * (l_target + 1); -// T * volumes_1 = smT; if (CALC_VOL) smT += blockDim.y * (l_target + 1); -// T * 
volumes_2 = smT; if (CALC_VOL) smT += blockDim.z * (l_target + 1); -// T * volumes_3_plus = smT; -// if (CALC_VOL && D > 3) smT += (D-3) * (l_target + 1); - -// SIZE * smInt = (SIZE *)smT; -// SIZE *ldvs_sm = smInt; smInt += D; -// SIZE *ldws_sm = smInt; smInt += D; -// SIZE *shape_sm = smInt; smInt += D; -// SIZE *shapes_sm = smInt; smInt += D * (l_target + 2); - -// if (threadId < l_target + 1) { -// quantizers_sm[threadId] = quantizers[threadId]; -// } -// if (threadId < D) { -// ldvs_sm[threadId] = ldvs[threadId]; -// ldws_sm[threadId] = ldws[threadId]; -// shape_sm[threadId] = shape[threadId]; -// // if (threadId == 0) { -// // printf("%u %u %u %u %u %u\n", shape[0], shape[1], shape[2], ldws[0], -// ldws[1], ldws[2]); -// // } -// } -// if (threadId < D * (l_target + 2)) { -// shapes_sm[threadId] = shapes[threadId]; -// // printf ("D: %d l_target+2: %d load shapes[%llu]: %d\n", D, l_target+2, -// // threadId, shapes_sm[threadId]); -// } - -// __syncthreads(); - -// // determine global idx -// SIZE idx[D]; //thread global idx -// SIZE idx0[D]; //block global idx - -// SIZE firstD = div_roundup(shapes_sm[l_target + 1], F); - -// SIZE bidx = blockIdx.x; -// idx[0] = (bidx % firstD) * F + threadIdx.x; -// idx0[0] = (bidx % firstD) * F; - -// // printf("shapes_sm[l_target+1]: %d firstD %d idx[0] %d\n", -// // shapes_sm[l_target+1], firstD, idx[0]); - -// bidx /= firstD; -// if (D >= 2) { -// idx[1] = blockIdx.y * blockDim.y + threadIdx.y; -// idx0[1] = blockIdx.y * blockDim.y; -// } -// if (D >= 3) { -// idx[2] = blockIdx.z * blockDim.z + threadIdx.z; -// idx0[2] = blockIdx.z * blockDim.z; -// } - -// for (int d = 3; d < D; d++) { -// idx[d] = bidx % shapes_sm[(l_target + 2) * d + l_target + 1]; -// idx0[d] = idx[d]; -// bidx /= shapes_sm[(l_target + 2) * d + l_target + 1]; - -// } - -// if (CALC_VOL) { -// // cache volumes -// for (int l = 0; l < l_target+1; l++) { -// // volumes 0 -// if (threadId < blockDim.x && idx0[0] + threadId < shapes_sm[(l_target + -// 2) * 0 + l_target + 1]) { -// volumes_0[l * blockDim.x + threadId] = -// volumes[(0 * (l_target + 1) + l) * ldvolumes + idx0[0] + threadId]; -// // printf("load %f\n", volumes[(0 * (l_target + 1) + l) * ldvolumes + -// idx0[0] + threadId]); -// } -// if (D >= 2) { -// // volumes 1 -// if (threadId < blockDim.y && idx0[1] + threadId < shapes_sm[(l_target -// + 2) * 1 + l_target + 1]) { -// volumes_1[l * blockDim.y + threadId] = -// volumes[(1 * (l_target + 1) + l) * ldvolumes + idx0[1] + -// threadId]; -// } -// } -// if (D >= 3) { -// // volumes 2 -// if (threadId < blockDim.z && idx0[2] + threadId < shapes_sm[(l_target -// + 2) * 2 + l_target + 1]) { -// volumes_2[l * blockDim.z + threadId] = -// volumes[(2 * (l_target + 1) + l) * ldvolumes + idx0[2] + -// threadId]; -// } -// } -// } - -// if (D >= 4) { -// if (threadId < 1) { -// for (int d = 3; d < D; d++) { -// for (int l = 0; l < l_target+1; l++) { -// volumes_3_plus[(d-3) * (l_target + 1) + l] = -// volumes[(d * (l_target + 1) + l) * ldvolumes + idx[d]]; -// } -// } -// } -// } -// } - -// // if (blockIdx.y == 0 && blockIdx.x == 0 && blockIdx.z == 0 && threadId == -// 0) { -// // printf("volumes_0: "); -// // for (int l = 0; l < l_target+1; l++) { -// // printf("l = %d\n", l); -// // for (int i = 0; i < min(blockDim.x, shapes_sm[(l_target + 2) * 0 + -// l_target + 1]) ; i++) { -// // printf("%f ", volumes_0[l * blockDim.x + i]); -// // } -// // printf("\n"); -// // } -// // printf("\n"); -// // if (D >= 2) { -// // printf("volumes_1: "); -// // for (int l = 0; l < l_target+1; 
l++) { -// // printf("l = %d\n", l); -// // for (int i = 0; i < min(blockDim.y, shapes_sm[(l_target + 2) * 1 + -// l_target + 1]); i++) { -// // printf("%f ", volumes_1[l * blockDim.y + i]); -// // } -// // printf("\n"); -// // } - -// // printf("\n"); -// // } -// // if (D >= 3) { -// // printf("volumes_2: "); -// // for (int l = 0; l < l_target+1; l++) { -// // printf("l = %d\n", l); -// // for (int i = 0; i < min(blockDim.z, shapes_sm[(l_target + 2) * 2 + -// l_target + 1]); i++) { -// // printf("%f ", volumes_2[l * blockDim.y + i]); -// // } -// // printf("\n"); -// // } -// // } -// // } - -// __syncthreads(); - -// int level = 0; -// for (DIM d = 0; d < D; d++) { -// long long unsigned int l_bit = 0l; -// for (SIZE l = 0; l < l_target + 1; l++) { -// int bit = (idx[d] >= shapes_sm[(l_target + 2) * d + l]) && -// (idx[d] < shapes_sm[(l_target + 2) * d + l + 1]); -// l_bit += bit << l; -// // printf("idx: %d %d d: %d l_bit: %llu\n", idx[1], idx[0], d, l_bit); -// } -// level = max(level, __ffsll(l_bit)); -// } -// level = level - 1; - -// bool in_range = true; -// for (DIM d = 0; d < D; d++) { -// if (idx[d] >= shapes_sm[(l_target + 2) * d + l_target + 1]) -// in_range = false; -// } - -// // printf("idx %llu, level: %d, in_range: %d idx[0]: shape_sm: %d\n", -// // get_idx(shape_sm, idx), level, in_range, shapes_sm[(l_target+2) * 0 + -// // l_target+1]); - -// if (level >= 0 && level <= l_target && in_range) { -// T t = dv[get_idx(ldvs, idx)]; -// T volume = 1; -// if (CALC_VOL) { -// volume *= volumes_0[level * blockDim.x + threadIdx.x]; -// if (D >= 2) { -// volume *= volumes_1[level * blockDim.y + threadIdx.y]; -// } -// if (D >= 3) { -// volume *= volumes_2[level * blockDim.z + threadIdx.z]; -// } -// if (D >= 4) { -// for (int d = 3; d < D; d++) { -// volume *= volumes_3_plus[(d-3) * (l_target + 1) + level]; -// } -// } -// if (sizeof(T) == sizeof(double)) volume = sqrt(volume); -// else if (sizeof(T) == sizeof(float)) volume = sqrtf(volume); -// } -// // printf("l: %d, vol %f(%f*%f*%f), quantizers_sm: %f, quantizers: %f, -// before: %f, quantized: %d\n", level, volume, -// // volumes_0[level * blockDim.x + threadIdx.x], volumes_1[level * -// blockDim.y + threadIdx.y], volumes_2[level * blockDim.z + threadIdx.z], -// // quantizers_sm[level], -// // (quantizers_sm[level] / volume), t, (int)copysign(0.5 + fabs(t /( -// quantizers_sm[level] / volume)), t)); - -// QUANTIZED_INT quantized_data = copysign(0.5 + fabs(t / -// (quantizers_sm[level] * volume) ), t); -// // QUANTIZED_INT quantized_data = copysign(0.5 + fabs(t / -// (quantizers_sm[level] / volume) ), t); -// // printf("dv[%llu] %f quantizers[%d]%f -> dw[%llu]%d \n", -// // get_idx(ldvs, idx), t, -// // level, quantizers_sm[level], -// // get_idx(ldws, idx), quantized_data+dict_size / 2); - -// if (prep_huffman) { -// quantized_data += dict_size / 2; -// if (quantized_data >= 0 && quantized_data < dict_size) { -// // do nothing -// } else { -// LENGTH i = atomicAdd(outlier_count, (LENGTH)1); -// outlier_idx[i] = get_idx(shape_sm, idx); -// outliers[i] = quantized_data; -// quantized_data = 0; -// } -// // if (get_idx(shape_sm, idx) < quant_meta_size_ratio) { -// // size_t i = atomicAdd((unsigned long long int*)outlier_count, -// // (unsigned long long int)1); outlier_idx[i] = get_idx(shape_sm, -// // idx); -// // } -// } - -// dwork[get_idx(ldws_sm, idx)] = quantized_data; -// } -// } - -// template -// void levelwise_linear_quantize_adaptive_launcher( -// Handle &handle, SIZE *shapes, SIZE l_target, T *volumes, -// SIZE 
ldvolumes, Metadata &m, T *dv, SIZE *ldvs, QUANTIZED_INT *dwork, -// SIZE *ldws, bool prep_huffman, SIZE *shape, LENGTH *outlier_count, -// LENGTH *outlier_idx, QUANTIZED_INT *outliers, int queue_idx) { - -// T *quantizers = new T[l_target + 1]; -// calc_quantizers(handle, quantizers, m, false); -// cudaMemcpyAsyncHelper(handle, handle.quantizers, quantizers, -// sizeof(T) * (l_target + 1), H2D, queue_idx); - -// // printf("norm: %f, tol: %f, s: %f, dict_size: %d\n", m.norm, m.tol, m.s, -// // m.dict_size); -// int total_thread_z = handle.dofs[2][0]; -// int total_thread_y = handle.dofs[1][0]; -// int total_thread_x = handle.dofs[0][0]; -// // linearize other dimensions -// int tbz = R; -// int tby = C; -// int tbx = F; -// int gridz = ceil((float)total_thread_z / tbz); -// int gridy = ceil((float)total_thread_y / tby); -// int gridx = ceil((float)total_thread_x / tbx); -// for (int d = 3; d < D; d++) { -// gridx *= handle.dofs[d][0]; -// } - -// // printf("exec: %d %d %d %d %d %d\n", tbx, tby, tbz, gridx, gridy, gridz); -// dim3 threadsPerBlock(tbx, tby, tbz); -// dim3 blockPerGrid(gridx, gridy, gridz); -// // ldvs + ldws + shape -// size_t sm_size = (D * 3) * sizeof(SIZE); -// // quantizer -// sm_size += (l_target + 1) * sizeof(T); -// // ranges -// sm_size += (l_target + 2) * D * sizeof(SIZE); -// // volumes -// sm_size += tbx * (l_target + 1) * sizeof(T); -// sm_size += tby * (l_target + 1) * sizeof(T); -// sm_size += tbz * (l_target + 1) * sizeof(T); -// if (D > 3) sm_size += (D-3) * (l_target + 1) * sizeof(T); -// // printf("sm_size: %llu\n", sm_size); -// if (m.ntype == norm_type::L_Inf) { -// _levelwise_linear_quantize -// <<>>( -// shapes, l_target, handle.quantizers, volumes, ldvolumes, dv, -// ldvs, dwork, ldws, prep_huffman, m.huff_dict_size, shape, -// outlier_count, outlier_idx, outliers); -// } else if (m.ntype == norm_type::L_2) { -// _levelwise_linear_quantize -// <<>>( -// shapes, l_target, handle.quantizers, volumes, ldvolumes, dv, -// ldvs, dwork, ldws, prep_huffman, m.huff_dict_size, shape, -// outlier_count, outlier_idx, outliers); -// } else { -// std::cout << log::log_err << "unsupported norm type!\n"; -// exit(-1); -// } - -// gpuErrchk(cudaGetLastError()); -// if (handle.sync_and_check_all_kernels) { -// gpuErrchk(cudaDeviceSynchronize()); -// } -// } - -// template -// void levelwise_linear_quantize(Handle &handle, SIZE *shapes, -// SIZE l_target, T *volumes, SIZE ldvolumes, -// Metadata &m, T *dv, SIZE *ldvs, -// QUANTIZED_INT *dwork, SIZE *ldws, -// bool prep_huffman, SIZE *shape, -// LENGTH *outlier_count, LENGTH *outlier_idx, -// QUANTIZED_INT *outliers, int queue_idx) { -// #define QUANTIZE(R, C, F) \ -// { \ -// levelwise_linear_quantize_adaptive_launcher( \ -// handle, shapes, l_target, volumes, ldvolumes, m, dv, ldvs, dwork, -// ldws, prep_huffman, \ -// shape, outlier_count, outlier_idx, outliers, queue_idx); \ -// } - -// if (D >= 3) { -// QUANTIZE(4, 4, 16) -// } -// if (D == 2) { -// QUANTIZE(1, 4, 32) -// } -// if (D == 1) { -// QUANTIZE(1, 1, 64) -// } -// #undef QUANTIZE -// } - -// template -// __global__ void -// _levelwise_linear_dequantize(SIZE *shapes, SIZE l_target, T *quantizers, T * -// volumes, SIZE ldvolumes, QUANTIZED_INT *dv, -// SIZE *ldvs, T *dwork, SIZE *ldws, bool -// prep_huffman, SIZE dict_size, LENGTH -// outlier_count, LENGTH *outlier_idx, -// QUANTIZED_INT *outliers) { - -// LENGTH threadId = (threadIdx.z * (blockDim.x * blockDim.y)) + -// (threadIdx.y * blockDim.x) + threadIdx.x; -// LENGTH blockId = (blockIdx.z * (gridDim.x * 
gridDim.y)) + -// (blockIdx.y * gridDim.x) + blockIdx.x; -// LENGTH gloablId = blockId * blockDim.x * blockDim.y * blockDim.z + -// threadId; - -// T * smT = SharedMemory(); -// T * quantizers_sm = smT; smT += l_target + 1; -// T * volumes_0 = smT; if (CALC_VOL) smT += blockDim.x * (l_target + 1); -// T * volumes_1 = smT; if (CALC_VOL) smT += blockDim.y * (l_target + 1); -// T * volumes_2 = smT; if (CALC_VOL) smT += blockDim.z * (l_target + 1); -// T * volumes_3_plus = smT; -// if (CALC_VOL && D > 3) smT += (D-3) * (l_target + 1); - -// SIZE * smInt = (SIZE *)smT; -// SIZE *ldvs_sm = smInt; smInt += D; -// SIZE *ldws_sm = smInt; smInt += D; -// SIZE *shape_sm = smInt; smInt += D; -// SIZE *shapes_sm = smInt; smInt += D * (l_target + 2); - -// if (threadId < l_target + 1) { -// quantizers_sm[threadId] = quantizers[threadId]; -// } -// if (threadId < D) { -// ldvs_sm[threadId] = ldvs[threadId]; -// ldws_sm[threadId] = ldws[threadId]; -// } -// if (threadId < D * (l_target + 2)) { -// shapes_sm[threadId] = shapes[threadId]; -// } - -// __syncthreads(); - -// // bool debug = false; -// // if (blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && -// // threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) { -// // debug = true; -// // for (int d = 0; d < D; d ++) { -// // printf("shapes_sm[%d]\n", d); -// // for (int l = 0; l < l_target + 1; l++) { -// // printf("%d ", shapes_sm[(l_target+1) * d + l]); -// // } -// // printf("\n"); -// // } -// // } -// // __syncthreads(); - -// // determine global idx -// SIZE idx[D]; //thread global idx -// SIZE idx0[D]; //block global idx - -// SIZE firstD = div_roundup(shapes_sm[l_target + 1], F); - -// SIZE bidx = blockIdx.x; -// idx[0] = (bidx % firstD) * F + threadIdx.x; -// idx0[0] = (bidx % firstD) * F; - -// // printf("shapes_sm[l_target+1]: %d firstD %d idx[0] %d\n", -// // shapes_sm[l_target+1], firstD, idx[0]); - -// bidx /= firstD; -// if (D >= 2) { -// idx[1] = blockIdx.y * blockDim.y + threadIdx.y; -// idx0[1] = blockIdx.y * blockDim.y; -// } -// if (D >= 3) { -// idx[2] = blockIdx.z * blockDim.z + threadIdx.z; -// idx0[2] = blockIdx.z * blockDim.z; -// } - -// for (DIM d = 3; d < D; d++) { -// idx[d] = bidx % shapes_sm[(l_target + 2) * d + l_target + 1]; -// idx0[d] = idx[d]; -// bidx /= shapes_sm[(l_target + 2) * d + l_target + 1]; - -// } - -// if (CALC_VOL) { -// // cache volumes -// for (SIZE l = 0; l < l_target+1; l++) { -// // volumes 0 -// if (threadId < blockDim.x && idx0[0] + threadId < shapes_sm[(l_target + -// 2) * 0 + l_target + 1]) { -// // printf("%d < %d[%d, %d, %d]\n", idx0[0] + (int)threadId, -// // shapes_sm[(l_target + 2) * 0 + l_target + 1], -// // l_target, (l_target + 2) * 0 + l_target + 1, l_target + 2); -// volumes_0[l * blockDim.x + threadId] = -// volumes[(0 * (l_target + 1) + l) * ldvolumes + idx0[0] + threadId]; -// // printf("load %f\n", volumes_0[l * blockDim.x + threadId]); -// } -// if (D >= 2) { -// // volumes 1 -// if (threadId < blockDim.y && idx0[1] + threadId < shapes_sm[(l_target -// + 2) * 1 + l_target + 1]) { -// volumes_1[l * blockDim.y + threadId] = -// volumes[(1 * (l_target + 1) + l) * ldvolumes + idx0[1] + -// threadId]; -// } -// } -// if (D >= 3) { -// // volumes 2 -// if (threadId < blockDim.z && idx0[2] + threadId < shapes_sm[(l_target -// + 2) * 2 + l_target + 1]) { -// volumes_2[l * blockDim.z + threadId] = -// volumes[(2 * (l_target + 1) + l) * ldvolumes + idx0[2] + -// threadId]; -// } -// } -// } - -// if (D >= 4) { -// if (threadId < 1) { -// for (DIM d = 3; d < D; d++) { -// for 
(SIZE l = 0; l < l_target+1; l++) { -// volumes_3_plus[(d-3) * (l_target + 1) + l] = -// volumes[(d * (l_target + 1) + l) * ldvolumes + idx[d]]; -// } -// } -// } -// } -// } - -// // if (blockIdx.y == 0 && blockIdx.x == 0 && threadId == 0) { -// // printf("volumes_0: "); -// // for (int l = 0; l < l_target+1; l++) { -// // printf("l = %d\n", l); -// // for (int i = 0; i < min(blockDim.x, shapes_sm[(l_target + 2) * 0 + -// l_target + 1]) ; i++) { -// // printf("%f ", volumes_0[l * blockDim.x + i]); -// // } -// // printf("\n"); -// // } -// // printf("\n"); -// // printf("volumes_1: "); -// // for (int l = 0; l < l_target+1; l++) { -// // printf("l = %d\n", l); -// // for (int i = 0; i < min(blockDim.y, shapes_sm[(l_target + 2) * 1 + -// l_target + 1]); i++) { -// // printf("%f ", volumes_1[l * blockDim.y + i]); -// // } -// // printf("\n"); -// // } - -// // } - -// __syncthreads(); - -// int level = 0; -// for (DIM d = 0; d < D; d++) { -// long long unsigned int l_bit = 0l; -// for (SIZE l = 0; l < l_target + 1; l++) { -// int bit = (idx[d] >= shapes_sm[(l_target + 2) * d + l]) && -// (idx[d] < shapes_sm[(l_target + 2) * d + l + 1]); -// l_bit += bit << l; -// // printf("idx: %d %d d: %d l_bit: %llu\n", idx[1], idx[0], d, l_bit); -// } -// level = max(level, __ffsll(l_bit)); -// } - -// bool in_range = true; -// for (DIM d = 0; d < D; d++) { -// if (idx[d] >= shapes_sm[(l_target + 2) * d + l_target + 1]) -// in_range = false; -// } - -// level = level - 1; -// if (level >= 0 && level <= l_target && in_range) { -// // printf("%d %d %d %d\n", idx[3], idx[2], idx[1], idx[0]); -// // printf("idx: %d %d l: %d\n", idx[1], idx[0], level); -// QUANTIZED_INT quantized_data = dv[get_idx(ldvs, idx)]; -// T volume = 1; -// if (CALC_VOL) { -// volume *= volumes_0[level * blockDim.x + threadIdx.x]; -// if (D >= 2) volume *= volumes_1[level * blockDim.y + threadIdx.y]; -// if (D >= 3) volume *= volumes_2[level * blockDim.z + threadIdx.z]; -// if (D >= 4) { -// for (int d = 3; d < D; d++) { -// volume *= volumes_3_plus[(d-3) * (l_target + 1) + level]; -// } -// } -// if (sizeof(T) == sizeof(double)) volume = sqrt(volume); -// else if (sizeof(T) == sizeof(float)) volume = sqrtf(volume); -// } - -// if (prep_huffman) { -// quantized_data -= dict_size / 2; -// } - -// // printf("%d %d %d %d %d %d vol %f (%f * %f * %f), dequantizers: %f, -// before: %d, dequantized: %f\n", blockIdx.z, blockIdx.y, blockIdx.x, -// threadIdx.z, threadIdx.y, threadIdx.x, volume, -// // volumes_0[level * blockDim.x + threadIdx.x], volumes_1[level * -// blockDim.y + threadIdx.y], volumes_2[level * blockDim.z + threadIdx.z], -// // quantizers_sm[level] / volume, quantized_data, (quantizers_sm[level] -// / volume) * (T)quantized_data); dwork[get_idx(ldws, idx)] = -// (quantizers_sm[level] * volume) * (T)quantized_data; -// // dwork[get_idx(ldws, idx)] = (quantizers_sm[level] / volume) * -// (T)quantized_data; -// // dwork[get_idx(ldws, idx)] = (T)dv[get_idx(ldvs, idx)]; - -// // printf("dw[%llu] %d dequantizers[%d]%f -> dw[%llu]%f \n", -// // get_idx(ldvs, idx), -// // quantized_data, level, quantizers_sm[level], get_idx(ldws, -// idx), -// // quantizers_sm[level] * (T)quantized_data); -// } - -// // //outliers -// // if (gloablId < outlier_count) { -// // size_t linerized_idx = outlier_idx[gloablId]; -// // for (int d = 0; d < D; d++) { -// // idx[d] = linerized_idx % shapes_sm[(l_target+2) * d+l_target+1]; -// // linerized_idx /= shapes_sm[(l_target+2) * d+l_target+1]; -// // } -// // int outliter = outliers[gloablId]; -// // 
outliter -= dict_size / 2; - -// // level = 0; -// // for (int d = 0; d < D; d++) { -// // long long unsigned int l_bit = 0l; -// // for (int l = 0; l < l_target+1; l++) { -// // int bit = (idx[d] >= shapes_sm[(l_target+2) * d + l]) && (idx[d] < -// // shapes_sm[(l_target+2) * d + l+1]); l_bit += bit << l; -// // // printf("idx: %d %d d: %d l_bit: %llu\n", idx[1], idx[0], d, -// // l_bit); -// // } -// // level = max(level, __ffsll(l_bit)); -// // } -// // level = level - 1; - -// // dwork[get_idx(ldws, idx)] = quantizers_sm[level] * (T)outliter; - -// // // printf("outliter: dw[%llu] %d dequantizers[%d]%f -> dw[%llu]%f \n", -// // get_idx(ldvs, idx), -// // // outliter, level, quantizers_sm[level], get_idx(ldws, idx), -// // quantizers_sm[level] * (T)outliter); - -// // } -// } - -// template -// __global__ void _levelwise_linear_dequantize_outliers( -// SIZE *shapes, SIZE l_target, T *quantizers, T * volumes, SIZE ldvolumes, -// QUANTIZED_INT *dv, SIZE *ldvs, T *dwork, SIZE *ldws, SIZE dict_size, -// LENGTH outlier_count, LENGTH *outlier_idx, QUANTIZED_INT *outliers) { - -// size_t threadId = (threadIdx.z * (blockDim.x * blockDim.y)) + -// (threadIdx.y * blockDim.x) + threadIdx.x; -// size_t blockId = (blockIdx.z * (gridDim.x * gridDim.y)) + -// (blockIdx.y * gridDim.x) + blockIdx.x; -// size_t gloablId = blockId * blockDim.x * blockDim.y * blockDim.z + -// threadId; - -// T *sm = SharedMemory(); -// T *quantizers_sm = sm; sm += l_target + 1; - -// SIZE *sm_size = (SIZE*)sm; -// SIZE *ldvs_sm = sm_size; sm_size += D; -// SIZE *ldws_sm = sm_size; sm_size += D; -// SIZE *shapes_sm = sm_size; sm_size += D * (l_target + 2); - -// if (threadId < l_target + 1) { -// quantizers_sm[threadId] = quantizers[threadId]; -// } -// if (threadId < D) { -// ldvs_sm[threadId] = ldvs[threadId]; -// ldws_sm[threadId] = ldws[threadId]; -// } -// if (threadId < D * (l_target + 2)) { -// shapes_sm[threadId] = shapes[threadId]; -// } - -// __syncthreads(); -// SIZE idx[D]; //thread global idx - -// // outliers -// if (gloablId < outlier_count) { -// size_t linerized_idx = outlier_idx[gloablId]; -// // for (DIM d = 0; d < D; d++) { -// // idx[d] = linerized_idx % shapes_sm[(l_target + 2) * d + l_target + -// 1]; -// // linerized_idx /= shapes_sm[(l_target + 2) * d + l_target + 1]; -// // } -// QUANTIZED_INT outliter = outliers[gloablId]; - -// dv[linerized_idx] = outliter; -// // printf("put back[%llu] <- outlier[%llu]: %llu\n", linerized_idx, -// gloablId, outliter); - -// // outliter -= dict_size / 2; - -// // int level = 0; -// // for (DIM d = 0; d < D; d++) { -// // long long unsigned int l_bit = 0l; -// // for (SIZE l = 0; l < l_target + 1; l++) { -// // int bit = (idx[d] >= shapes_sm[(l_target + 2) * d + l]) && -// // (idx[d] < shapes_sm[(l_target + 2) * d + l + 1]); -// // l_bit += bit << l; -// // // printf("idx: %d %d d: %d l_bit: %llu\n", idx[1], idx[0], d, -// l_bit); -// // } -// // level = max(level, __ffsll(l_bit)); -// // } -// // level = level - 1; - -// // T volume = 1; - -// // if (CALC_VOL) { -// // for (DIM d = 0; d < D; d++) { -// // volume *= volumes[(d * (l_target+1) + level) * ldvolumes + -// idx[d]]; -// // } -// // if (sizeof(T) == sizeof(double)) volume = sqrt(volume); -// // else if (sizeof(T) == sizeof(float)) volume = sqrtf(volume); -// // } -// // dwork[get_idx(ldws, idx)] = (quantizers_sm[level] * volume) * -// (T)outliter; -// } -// } - -// template -// void levelwise_linear_dequantize_adaptive_launcher( -// Handle &handle, SIZE *shapes, SIZE l_target, T *volumes, -// SIZE 
ldvolumes, Metadata &m, QUANTIZED_INT *dv, SIZE *ldvs, T *dwork, -// SIZE *ldws, bool prep_huffman, LENGTH outlier_count, LENGTH *outlier_idx, -// QUANTIZED_INT *outliers, int queue_idx) { - -// // printf("norm: %f, tol: %f, s: %f, dict_size: %d\n", m.norm, m.tol, m.s, -// // m.dict_size); - -// T *quantizers = new T[l_target + 1]; -// calc_quantizers(handle, quantizers, m, false); -// cudaMemcpyAsyncHelper(handle, handle.quantizers, quantizers, -// sizeof(T) * (l_target + 1), H2D, queue_idx); - -// SIZE total_thread_z = handle.dofs[2][0]; -// SIZE total_thread_y = handle.dofs[1][0]; -// SIZE total_thread_x = handle.dofs[0][0]; -// // linearize other dimensions -// SIZE tbz = R; -// SIZE tby = C; -// SIZE tbx = F; -// SIZE gridz = ceil((float)total_thread_z / tbz); -// SIZE gridy = ceil((float)total_thread_y / tby); -// SIZE gridx = ceil((float)total_thread_x / tbx); -// for (DIM d = 3; d < D; d++) { -// gridx *= handle.dofs[d][0]; -// } - -// // printf("exec: %d %d %d %d %d %d\n", tbx, tby, tbz, gridx, gridy, gridz); -// dim3 threadsPerBlock(tbx, tby, tbz); -// dim3 blockPerGrid(gridx, gridy, gridz); -// size_t sm_size = (D * 3) * sizeof(SIZE); -// sm_size += (l_target + 1) * sizeof(T); -// sm_size += (l_target + 2) * D * sizeof(SIZE); -// sm_size += tbx * (l_target + 1) * sizeof(T); -// sm_size += tby * (l_target + 1) * sizeof(T); -// sm_size += tbz * (l_target + 1) * sizeof(T); -// if (D > 3) sm_size += (D-3) * (l_target + 1) * sizeof(T); - -// if (m.ntype == norm_type::L_Inf) { -// if (prep_huffman) { -// _levelwise_linear_dequantize_outliers -// <<>>( -// shapes, l_target, handle.quantizers, volumes, ldvolumes, dv, -// ldvs, dwork, ldws, m.huff_dict_size, outlier_count, -// outlier_idx, outliers); -// } -// gpuErrchk(cudaDeviceSynchronize()); -// _levelwise_linear_dequantize -// <<>>( -// shapes, l_target, handle.quantizers, volumes, ldvolumes, dv, -// ldvs, dwork, ldws, prep_huffman, m.huff_dict_size, outlier_count, -// outlier_idx, outliers); -// gpuErrchk(cudaDeviceSynchronize()); -// } else if (m.ntype == norm_type::L_2){ -// if (prep_huffman) { -// _levelwise_linear_dequantize_outliers -// <<>>( -// shapes, l_target, handle.quantizers, volumes, ldvolumes, dv, -// ldvs, dwork, ldws, m.huff_dict_size, outlier_count, -// outlier_idx, outliers); -// } -// gpuErrchk(cudaDeviceSynchronize()); -// _levelwise_linear_dequantize -// <<>>( -// shapes, l_target, handle.quantizers, volumes, ldvolumes, dv, -// ldvs, dwork, ldws, prep_huffman, m.huff_dict_size, outlier_count, -// outlier_idx, outliers); -// gpuErrchk(cudaDeviceSynchronize()); -// } else { -// std::cout << log::log_err << "unsupported norm type!\n"; -// exit(-1); -// } -// gpuErrchk(cudaGetLastError()); -// if (handle.sync_and_check_all_kernels) { -// gpuErrchk(cudaDeviceSynchronize()); -// } -// } - -// template -// void levelwise_linear_dequantize(Handle &handle, SIZE *shapes, -// SIZE l_target, T *volumes, SIZE ldvolumes, -// Metadata &m, QUANTIZED_INT *dv, SIZE *ldvs, -// T *dwork, SIZE *ldws, bool prep_huffman, -// LENGTH outlier_count, LENGTH *outlier_idx, -// QUANTIZED_INT *outliers, int queue_idx) { -// #define DEQUANTIZE(R, C, F) \ -// { \ -// levelwise_linear_dequantize_adaptive_launcher( \ -// handle, shapes, l_target, volumes, ldvolumes, m, dv, ldvs, dwork, -// ldws, prep_huffman, outlier_count, \ -// outlier_idx, outliers, queue_idx); \ -// } - -// if (D >= 3) { -// DEQUANTIZE(4, 4, 16) -// } -// if (D == 2) { -// DEQUANTIZE(1, 4, 32) -// } -// if (D == 1) { -// DEQUANTIZE(1, 1, 64) -// } - -// #undef DEQUANTIZE -// 
}
-
 } // namespace mgard_x

 #endif
\ No newline at end of file
diff --git a/include/mgard-x/RuntimeX/AutoTuners/AutoTuner.h b/include/mgard-x/RuntimeX/AutoTuners/AutoTuner.h
index 9afa942650..d7bfae1485 100644
--- a/include/mgard-x/RuntimeX/AutoTuners/AutoTuner.h
+++ b/include/mgard-x/RuntimeX/AutoTuners/AutoTuner.h
@@ -192,12 +192,17 @@ MGARDX_CONT void FillAutoTunerTable(std::string kernel_name, int precision_idx,
                                     int range_l, int config) {

   std::string device_type_string = "";
-  if (std::is_same<DeviceType, Serial>::value) {
+  if (std::is_same<DeviceType, SERIAL>::value) {
     device_type_string = "Serial";
   } else if (std::is_same<DeviceType, CUDA>::value) {
     device_type_string = "Cuda";
   } else if (std::is_same<DeviceType, HIP>::value) {
     device_type_string = "Hip";
+  } else if (std::is_same<DeviceType, SYCL>::value) {
+    device_type_string = "Sycl";
+  } else {
+    std::cout << log::log_err << "invalid device_type in FillAutoTunerTable.\n";
+    exit(-1);
   }

   string curr_file_path = __FILE__;
@@ -282,11 +287,21 @@ template <typename DeviceType> class AutoTuner {
   static AutoTuningTable<DeviceType> autoTuningTable;
   static bool ProfileKenrles;
 };
+
+template <typename DeviceType> void BeginAutoTuning() {
+  AutoTuner<DeviceType>::ProfileKernels = true;
+}
+
+template <typename DeviceType> void EndAutoTuning() {
+  AutoTuner<DeviceType>::ProfileKernels = false;
+}
+
 } // namespace mgard_x

 #include "AutoTunerCuda.h"
 #include "AutoTunerHip.h"
 #include "AutoTunerKokkos.h"
 #include "AutoTunerSerial.h"
+#include "AutoTunerSycl.h"

 #endif
\ No newline at end of file
diff --git a/include/mgard-x/RuntimeX/AutoTuners/AutoTunerSerial.h b/include/mgard-x/RuntimeX/AutoTuners/AutoTunerSerial.h
index c82b0e814e..c59f8835eb 100644
--- a/include/mgard-x/RuntimeX/AutoTuners/AutoTunerSerial.h
+++ b/include/mgard-x/RuntimeX/AutoTuners/AutoTunerSerial.h
@@ -10,7 +10,7 @@

 namespace mgard_x {

-template <> class AutoTuningTable<Serial> {
+template <> class AutoTuningTable<SERIAL> {
 public:
   static const int num_precision = 2;
   static const int num_range = 9;
@@ -56,11 +56,11 @@ template <> class AutoTuningTable<Serial> {
   static int llk[num_precision][num_range];
 };

-template <> class AutoTuner<Serial> {
+template <> class AutoTuner<SERIAL> {
 public:
   MGARDX_CONT
   AutoTuner(){};
-  static AutoTuningTable<Serial> autoTuningTable;
+  static AutoTuningTable<SERIAL> autoTuningTable;
   static bool ProfileKernels;
 };
diff --git a/include/mgard-x/RuntimeX/AutoTuners/AutoTunerSycl.h b/include/mgard-x/RuntimeX/AutoTuners/AutoTunerSycl.h
new file mode 100644
index 0000000000..28c23e8562
--- /dev/null
+++ b/include/mgard-x/RuntimeX/AutoTuners/AutoTunerSycl.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2022, Oak Ridge National Laboratory.
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#ifndef MGARD_X_AUTOTUNER_SYCL_H +#define MGARD_X_AUTOTUNER_SYCL_H + +namespace mgard_x { + +template <> class AutoTuningTable { +public: + static const int num_precision = 2; + static const int num_range = 9; + + static int gpk_reo_3d[num_precision][num_range]; + + static int gpk_rev_3d[num_precision][num_range]; + + static int gpk_reo_nd[num_precision][num_range]; + + static int gpk_rev_nd[num_precision][num_range]; + + static int lpk1_3d[num_precision][num_range]; + + static int lpk2_3d[num_precision][num_range]; + + static int lpk3_3d[num_precision][num_range]; + + static int lpk1_nd[num_precision][num_range]; + + static int lpk2_nd[num_precision][num_range]; + + static int lpk3_nd[num_precision][num_range]; + + static int ipk1_3d[num_precision][num_range]; + + static int ipk2_3d[num_precision][num_range]; + + static int ipk3_3d[num_precision][num_range]; + + static int ipk1_nd[num_precision][num_range]; + + static int ipk2_nd[num_precision][num_range]; + + static int ipk3_nd[num_precision][num_range]; + + static int lwpk[num_precision][num_range]; + + static int lwqzk[num_precision][num_range]; + + static int lwdqzk[num_precision][num_range]; + + static int llk[num_precision][num_range]; +}; + +template <> class AutoTuner { +public: + MGARDX_CONT + AutoTuner(){}; + static AutoTuningTable autoTuningTable; + static bool ProfileKernels; +}; + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git a/include/mgard-x/RuntimeX/AutoTuners/CMakeLists.txt b/include/mgard-x/RuntimeX/AutoTuners/CMakeLists.txt index be80fce9c1..507bd33418 100644 --- a/include/mgard-x/RuntimeX/AutoTuners/CMakeLists.txt +++ b/include/mgard-x/RuntimeX/AutoTuners/CMakeLists.txt @@ -3,5 +3,6 @@ list(APPEND MGARD_X_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/AutoTunerSerial.h ${CMAKE_CURRENT_SOURCE_DIR}/AutoTunerCuda.h ${CMAKE_CURRENT_SOURCE_DIR}/AutoTunerHip.h + ${CMAKE_CURRENT_SOURCE_DIR}/AutoTunerSycl.h ) set(MGARD_X_HEADER ${MGARD_X_HEADER} PARENT_SCOPE) \ No newline at end of file diff --git a/include/mgard-x/RuntimeX/DataStructures/SubArray.hpp b/include/mgard-x/RuntimeX/DataStructures/SubArray.hpp index 07d33f78d0..5003a245bf 100644 --- a/include/mgard-x/RuntimeX/DataStructures/SubArray.hpp +++ b/include/mgard-x/RuntimeX/DataStructures/SubArray.hpp @@ -17,14 +17,12 @@ template class SubArray { public: MGARDX_CONT_EXEC SubArray(); + MGARDX_CONT SubArray(Array &array, bool get_host_pointer = false); + MGARDX_CONT SubArray(std::vector shape, T *dv); - MGARDX_CONT_EXEC - SubArray(SubArray &subArray); - MGARDX_CONT_EXEC - SubArray(const SubArray &subArray); MGARDX_CONT_EXEC T *data() { return this->dv; } @@ -93,18 +91,12 @@ template class SubArray { MGARDX_CONT_EXEC void setPitched(bool pitched) { this->pitched = pitched; } - // MGARDX_CONT_EXEC - // SIZE * getLdd() { return this->ldvs_d; } - MGARDX_CONT_EXEC SIZE getLddv1() const { return this->lddv1; } MGARDX_CONT_EXEC SIZE getLddv2() const { return this->lddv2; } - MGARDX_CONT_EXEC - SubArray & - operator=(const SubArray &subArray); void offset(std::vector idx); MGARDX_CONT @@ -136,6 +128,11 @@ template class SubArray { return this->dv + offset; } + MGARDX_CONT_EXEC + T *operator()(IDX l, IDX z, IDX y, IDX x) { + return this->dv + this->_ldvs[2] * this->_ldvs[1] * this->_ldvs[0] * l + + this->_ldvs[1] * this->_ldvs[0] * z + this->_ldvs[0] * y + x; + } MGARDX_CONT_EXEC T *operator()(IDX z, IDX y, IDX x) 
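// Illustrative annotation (not part of the patch): the 4-D operator() overload
// added above computes the flat offset
//   ((l * _ldvs[2] + z) * _ldvs[1] + y) * _ldvs[0] + x,
// i.e. the same row-major indexing scheme as the 3-D overload that follows,
// extended by one extra leading dimension l.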
{ return this->dv + this->lddv2 * this->lddv1 * z + this->lddv1 * y + x; @@ -174,21 +171,15 @@ template class SubArray { MGARDX_CONT_EXEC bool isNull() { return this->dv == NULL; } - MGARDX_CONT_EXEC - ~SubArray(); - using DataType = T; using DevType = DeviceType; static const DIM NumDims = D; private: - // std::vector shape; - T *dv; // device pointer - T *v; // host pointer + T *dv = NULL; // device pointer + T *v = NULL; // host pointer bool has_host_pointer = false; - // std::vector ldvs_h; - // SIZE *ldvs_d; SIZE _ldvs[D]; SIZE _shape[D]; @@ -224,10 +215,7 @@ MGARDX_CONT_EXEC SubArray::SubArray() { template MGARDX_CONT SubArray::SubArray(Array &array, bool get_host_pointer) { - // this->shape = array.getShape(); this->dv = array.data(); - // this->ldvs_h = array.ld(); - // this->ldvs_d = array.get_ldvs_d(); for (DIM d = 0; d < D; d++) { this->_shape[d] = array.shape()[d]; @@ -245,9 +233,7 @@ MGARDX_CONT SubArray::SubArray(Array &array, template MGARDX_CONT SubArray::SubArray(std::vector shape, T *dv) { - // this->shape = shape; this->dv = dv; - // this->ldvs_h = shape; for (DIM d = 0; d < D; d++) { this->_shape[d] = shape[d]; @@ -258,89 +244,6 @@ MGARDX_CONT SubArray::SubArray(std::vector shape, this->lddv2 = this->_ldvs[1]; } -template -MGARDX_CONT_EXEC -SubArray::SubArray(SubArray &subArray) { - // this->shape = subArray.shape; - this->dv = subArray.dv; - // this->ldvs_h = subArray.ldvs_h; - // this->ldvs_d = subArray.ldvs_d; - - for (DIM d = 0; d < D; d++) { - this->_shape[d] = subArray.getShape(d); - this->_ldvs[d] = subArray._ldvs[d]; - } - - this->lddv1 = subArray.lddv1; - this->lddv2 = subArray.lddv2; - - this->projected_dim0 = subArray.projected_dim0; - this->projected_dim1 = subArray.projected_dim1; - this->projected_dim2 = subArray.projected_dim2; - - if (subArray.has_host_pointer) { - this->has_host_pointer = true; - this->v = subArray.v; - } - - this->pitched = subArray.pitched; -} - -template -MGARDX_CONT_EXEC SubArray::SubArray( - const SubArray &subArray) { - // this->shape = subArray.shape; - this->dv = subArray.dv; - // this->ldvs_h = subArray.ldvs_h; - // this->ldvs_d = subArray.ldvs_d; - - for (DIM d = 0; d < D; d++) { - this->_shape[d] = subArray._shape[d]; - this->_ldvs[d] = subArray._ldvs[d]; - } - - this->lddv1 = subArray.lddv1; - this->lddv2 = subArray.lddv2; - - this->projected_dim0 = subArray.projected_dim0; - this->projected_dim1 = subArray.projected_dim1; - this->projected_dim2 = subArray.projected_dim2; - - if (subArray.has_host_pointer) { - this->has_host_pointer = true; - this->v = subArray.v; - } - this->pitched = subArray.pitched; -} - -template -MGARDX_CONT_EXEC SubArray &SubArray:: -operator=(const SubArray &subArray) { - // this->shape = subArray.shape; - this->dv = subArray.dv; - // this->ldvs_h = subArray.ldvs_h; - // this->ldvs_d = subArray.ldvs_d; - - for (DIM d = 0; d < D; d++) { - this->_shape[d] = subArray._shape[d]; - this->_ldvs[d] = subArray._ldvs[d]; - } - - this->lddv1 = subArray.lddv1; - this->lddv2 = subArray.lddv2; - - this->projected_dim0 = subArray.projected_dim0; - this->projected_dim1 = subArray.projected_dim1; - this->projected_dim2 = subArray.projected_dim2; - - if (subArray.has_host_pointer) { - this->has_host_pointer = true; - this->v = subArray.v; - } - this->pitched = subArray.pitched; - return *this; -} - template MGARDX_CONT SubArray<1, T, DeviceType> SubArray::Linearize() { SubArray<1, T, DeviceType> subArray; @@ -348,20 +251,10 @@ MGARDX_CONT SubArray<1, T, DeviceType> SubArray::Linearize() { SIZE linearized_shape = 1; 
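// Illustrative annotation (not part of the patch): the loop below folds the
// D-dimensional shape into one flat length, so e.g. a {4, 3, 2} SubArray is
// re-viewed as a 1-D SubArray of 24 elements over the same device pointer.
// Note the rewritten Linearize() now goes through setShape/setLd/project
// rather than writing lddv1/lddv2 and projected_dim* directly.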
for (DIM d = 0; d < D; d++) linearized_shape *= this->_shape[d]; - // subArray.shape = {linearized_shape}; subArray.dv = this->dv; - // subArray.ldvs_h = this->ldvs_h; - // subArray.ldvs_d = this->ldvs_d; - - this->_shape[0] = linearized_shape; - this->_ldvs[0] = linearized_shape; - - subArray.lddv1 = linearized_shape; - subArray.lddv2 = 1; - - subArray.projected_dim0 = this->projected_dim0; - subArray.projected_dim1 = this->projected_dim1; - subArray.projected_dim2 = this->projected_dim2; + subArray.setShape(0, linearized_shape); + subArray.setLd(0, linearized_shape); + subArray.project(0, 1, 2); if (this->has_host_pointer) { subArray.has_host_pointer = true; @@ -395,16 +288,6 @@ SubArray::Slice3D(DIM d1, DIM d2, DIM d3) { subArray.setLd(2, this->_ldvs[d3]); subArray.project(d1, d2, d3); - // subArray.ldvs_h = this->ldvs_h; - // subArray.ldvs_d = this->ldvs_d; - - // subArray.lddv1 = subArray.ldvs[0]; - // subArray.lddv2 = subArray.ldvs[1]; - - // subArray.projected_dim0 = d1; - // subArray.projected_dim1 = d2; - // subArray.projected_dim2 = d3; - if (this->has_host_pointer) { subArray.setDataHost(this->v); } @@ -417,13 +300,11 @@ MGARDX_CONT void SubArray::offset(std::vector idx) { SIZE _idx[D]; for (DIM d = 0; d < D; d++) _idx[d] = idx[d]; - // dv += get_idx(ldvs_h, idx); dv += calc_offset(_idx); } template MGARDX_CONT void SubArray::resize(std::vector shape) { - // this->shape = shape; for (DIM d = 0; d < D; d++) { _shape[d] = shape[d]; } @@ -432,10 +313,6 @@ MGARDX_CONT void SubArray::resize(std::vector shape) { template MGARDX_CONT void SubArray::offset(DIM dim, SIZE offset_value) { - // std::vector idx(D, 0); - // idx[dim] = offset_value; - // dv += get_idx(ldvs_h, idx); - SIZE idx[D]; for (DIM d = 0; d < D; d++) idx[d] = 0; @@ -463,10 +340,5 @@ MGARDX_CONT void SubArray::project(DIM dim0, DIM dim1, } } -template -MGARDX_CONT_EXEC SubArray::~SubArray() { - // nothing needs to be released -} - } // namespace mgard_x #endif \ No newline at end of file diff --git a/include/mgard-x/RuntimeX/DataStructures/SubArrayCopy.hpp b/include/mgard-x/RuntimeX/DataStructures/SubArrayCopy.hpp index 25b95290ce..2674a60ee9 100644 --- a/include/mgard-x/RuntimeX/DataStructures/SubArrayCopy.hpp +++ b/include/mgard-x/RuntimeX/DataStructures/SubArrayCopy.hpp @@ -13,9 +13,9 @@ namespace mgard_x { template class CompatibleDeviceType { using DeviceType = std::conditional< - std::is_same::value && - std::is_same::value, - Serial, + std::is_same::value && + std::is_same::value, + SERIAL, std::conditional< std::is_same::value || std::is_same::value, @@ -26,7 +26,7 @@ class CompatibleDeviceType { HIP, std::conditional::value || std::is_same::value, - KOKKOS, None>>>>; + KOKKOS, NONE>>>>; }; template diff --git a/include/mgard-x/RuntimeX/DataTypes.h b/include/mgard-x/RuntimeX/DataTypes.h index 5a63a7397f..ca9867d9fe 100644 --- a/include/mgard-x/RuntimeX/DataTypes.h +++ b/include/mgard-x/RuntimeX/DataTypes.h @@ -12,6 +12,8 @@ #define MGARDX_COMPILE_CUDA #elif defined __HIPCC__ #define MGARDX_COMPILE_HIP +#elif defined SYCL_LANGUAGE_VERSION +#define MGARDX_COMPILE_SYCL #else #define MGARDX_COMPILE_SERIAL #endif @@ -58,6 +60,14 @@ namespace mgard_x { #define MGARDX_MANAGED __managed__ #endif +#ifdef MGARDX_COMPILE_SYCL +#define MGARDX_CONT __inline__ +#define MGARDX_KERL +#define MGARDX_EXEC __inline__ +#define MGARDX_CONT_EXEC __inline__ +#define MGARDX_MANAGED +#endif + #if defined MGARDX_COMPILE_KOKKOS #define MGARDX_CONT __inline__ #define MGARDX_KERL @@ -81,15 +91,16 @@ namespace mgard_x { #define SUBTRACT 2 
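// Illustrative sketch (not part of the patch): the empty tag classes declared
// below exist only for compile-time backend dispatch; a caller selects a
// backend by naming a tag. This uses only templates that appear elsewhere in
// this diff (BeginAutoTuning/EndAutoTuning, AutoTuner<DeviceType>):
//
//   template <typename DeviceType> void profile_backend() {
//     BeginAutoTuning<DeviceType>();  // AutoTuner<DeviceType>::ProfileKernels = true
//     // ... run kernels so each Execute() is timed per candidate config ...
//     EndAutoTuning<DeviceType>();    // AutoTuner<DeviceType>::ProfileKernels = false
//   }
//   // e.g. profile_backend<SYCL>(); or profile_backend<SERIAL>();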
class Device {}; -class Serial : public Device {}; +class SERIAL : public Device {}; class CUDA : public Device {}; class HIP : public Device {}; -class None : public Device {}; +class SYCL : public Device {}; +class NONE : public Device {}; #if defined MGARDX_COMPILE_KOKKOS using KOKKOS = Kokkos::DefaultExecutionSpace; #else -using KOKKOS = None; +using KOKKOS = NONE; #endif class DPCxx : public Device {}; diff --git a/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapter.h b/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapter.h index a02a1247af..a8a5da6387 100644 --- a/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapter.h +++ b/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapter.h @@ -16,7 +16,8 @@ namespace mgard_x { struct ExecutionReturn { - double execution_time = 0.0; + bool success = true; + double execution_time = std::numeric_limits::max(); }; template struct SyncBlock { @@ -27,10 +28,22 @@ template struct SyncGrid { MGARDX_EXEC static void Sync(); }; -template struct Atomic { - template MGARDX_EXEC static T Min(T *result, T value); - template MGARDX_EXEC static T Max(T *result, T value); - template MGARDX_EXEC static T Add(T *result, T value); +#define AtomicSystemScope 0 +#define AtomicDeviceScope 1 +#define AtomicBlockScope 2 + +#define AtomicGlobalMemory 0 +#define AtomicSharedMemory 1 + +#define RESOURCE_ENOUGH 0 +#define THREADBLOCK_TOO_LARGE 1 +#define SHARED_MEMORY_TOO_LARGE 2 + +template +struct Atomic { + MGARDX_EXEC static T Min(T *result, T value); + MGARDX_EXEC static T Max(T *result, T value); + MGARDX_EXEC static T Add(T *result, T value); }; template struct Math { @@ -99,6 +112,7 @@ template class DeviceSpecification { int *MaxNumThreadsPerSM; int *MaxNumThreadsPerTB; size_t *AvailableMemory; + std::string *DeviceNames; }; template class DeviceQueues { @@ -124,9 +138,17 @@ template class DeviceAdapter { MGARDX_CONT DeviceAdapter(){}; MGARDX_CONT + int IsResourceEnough() { return false; } + MGARDX_CONT ExecutionReturn Execute(){}; }; +template struct KeyValueComparator { + bool operator()(std::pair a, std::pair b) const { + return a.first < b.first; + } +}; + template class DeviceCollective { public: template MGARDX_CONT DeviceCollective(){}; @@ -215,6 +237,8 @@ template class DeviceRuntime { MGARDX_CONT static void SyncDevice() {} + MGARDX_CONT static std::string GetDeviceName() { return ""; } + MGARDX_CONT ~DeviceRuntime() {} diff --git a/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterCuda.h b/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterCuda.h index ec7ec4435a..c607ea5ee6 100644 --- a/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterCuda.h +++ b/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterCuda.h @@ -47,6 +47,28 @@ static __device__ __inline__ uint32_t __mylaneid() { return laneid; } +MGARDX_EXEC static float atomicMax(float *address, float val) { + int *address_as_i = (int *)address; + int old = *address_as_i, assumed; + do { + assumed = old; + old = ::atomicCAS(address_as_i, assumed, + __float_as_int(::fmaxf(val, __int_as_float(assumed)))); + } while (assumed != old); + return __int_as_float(old); +} + +MGARDX_EXEC static double atomicMax(double *address, double val) { + unsigned long long int *address_as_i = (unsigned long long int *)address; + unsigned long long int old = *address_as_i, assumed; + do { + assumed = old; + old = ::atomicCAS(address_as_i, assumed, + (unsigned long long int)::fmax(val, (double)assumed)); + } while (assumed != old); + return (double)old; +} + namespace mgard_x { template @@ -92,15 +114,34 @@ 
template <> struct SyncGrid { MGARDX_EXEC static void Sync() { cg::this_grid().sync(); } }; -template <> struct Atomic { - template MGARDX_EXEC static T Min(T *result, T value) { - return atomicMin(result, value); +template +struct Atomic { + MGARDX_EXEC static T Min(T *result, T value) { + if constexpr (Scope == AtomicSystemScope) { + return atomicMin_system(result, value); + } else if constexpr (Scope == AtomicDeviceScope) { + return atomicMin(result, value); + } else { + return atomicMin_block(result, value); + } } - template MGARDX_EXEC static T Max(T *result, T value) { - return atomicMax(result, value); + MGARDX_EXEC static T Max(T *result, T value) { + if constexpr (Scope == AtomicSystemScope) { + return atomicMax_system(result, value); + } else if constexpr (Scope == AtomicDeviceScope) { + return atomicMax(result, value); + } else { + return atomicMax_block(result, value); + } } - template MGARDX_EXEC static T Add(T *result, T value) { - return atomicAdd(result, value); + MGARDX_EXEC static T Add(T *result, T value) { + if constexpr (Scope == AtomicSystemScope) { + return atomicAdd_system(result, value); + } else if constexpr (Scope == AtomicDeviceScope) { + return atomicAdd(result, value); + } else { + return atomicAdd_block(result, value); + } } }; @@ -353,6 +394,7 @@ template <> class DeviceSpecification { MaxNumThreadsPerTB = new int[NumDevices]; AvailableMemory = new size_t[NumDevices]; SupportCooperativeGroups = new bool[NumDevices]; + DeviceNames = new std::string[NumDevices]; for (int d = 0; d < NumDevices; d++) { gpuErrchk(cudaSetDevice(d)); @@ -377,6 +419,7 @@ template <> class DeviceSpecification { } else if (prop.major == 7 && (prop.minor == 2 || prop.minor == 5)) { ArchitectureGeneration[d] = 2; } + DeviceNames[d] = std::string(prop.name); } } @@ -415,6 +458,10 @@ template <> class DeviceSpecification { return SupportCooperativeGroups[dev_id]; } + MGARDX_CONT std::string GetDeviceName(int dev_id) { + return DeviceNames[dev_id]; + } + MGARDX_CONT ~DeviceSpecification() { delete[] MaxSharedMemorySize; @@ -425,6 +472,7 @@ template <> class DeviceSpecification { delete[] MaxNumThreadsPerTB; delete[] AvailableMemory; delete[] SupportCooperativeGroups; + delete[] DeviceNames; } int NumDevices; @@ -436,6 +484,7 @@ template <> class DeviceSpecification { int *MaxNumThreadsPerTB; size_t *AvailableMemory; bool *SupportCooperativeGroups; + std::string *DeviceNames; }; template <> class DeviceQueues { @@ -516,6 +565,10 @@ template <> class DeviceRuntime { gpuErrchk(cudaDeviceSynchronize()); } + MGARDX_CONT static std::string GetDeviceName() { + return DeviceSpecs.GetDeviceName(curr_dev_id); + } + MGARDX_CONT static int GetMaxSharedMemorySize() { return DeviceSpecs.GetMaxSharedMemorySize(curr_dev_id); } @@ -1816,6 +1869,19 @@ template class DeviceAdapter { MGARDX_CONT DeviceAdapter(){}; + MGARDX_CONT + int IsResourceEnough(TaskType &task) { + if (task.GetBlockDimX() * task.GetBlockDimY() * task.GetBlockDimZ() > + DeviceRuntime::GetMaxNumThreadsPerTB()) { + return THREADBLOCK_TOO_LARGE; + } + if (task.GetSharedMemorySize() > + DeviceRuntime::GetMaxSharedMemorySize()) { + return SHARED_MEMORY_TOO_LARGE; + } + return RESOURCE_ENOUGH; + } + MGARDX_CONT ExecutionReturn Execute(TaskType &task) { @@ -1834,6 +1900,21 @@ template class DeviceAdapter { << task.GetGridDimY() << ", " << task.GetGridDimZ() << ">\n"; } + ExecutionReturn ret; + if (IsResourceEnough(task) != RESOURCE_ENOUGH) { + if (DeviceRuntime::PrintKernelConfig) { + if (IsResourceEnough(task) == THREADBLOCK_TOO_LARGE) { + 
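// Illustrative annotation (not part of the patch): failing fast here with
// ret.success = false and execution_time = numeric_limits<double>::max(),
// instead of launching an oversized kernel, is what lets the autotuning call
// sites (the LLK/LWQZK/LWDQZK macros earlier in this patch) walk config down
// from 6 to 0 until a configuration fits the device, aborting only when all
// seven candidates fail.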
std::cout << log::log_info << "threadblock too large.\n"; + } + if (IsResourceEnough(task) == SHARED_MEMORY_TOO_LARGE) { + std::cout << log::log_info << "shared memory too large.\n"; + } + } + ret.success = false; + ret.execution_time = std::numeric_limits::max(); + return ret; + } + Timer timer; if (DeviceRuntime::TimingAllKernels || AutoTuner::ProfileKernels) { @@ -1875,8 +1956,6 @@ template class DeviceAdapter { ErrorSyncCheck(cudaDeviceSynchronize(), task); } - ExecutionReturn ret; - if (DeviceRuntime::TimingAllKernels || AutoTuner::ProfileKernels) { DeviceRuntime::SyncDevice(); @@ -1885,6 +1964,7 @@ template class DeviceAdapter { timer.print(task.GetFunctorName()); } if (AutoTuner::ProfileKernels) { + ret.success = true; ret.execution_time = timer.get(); } } diff --git a/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterHip.h b/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterHip.h index d8a245ceaa..70f35338fa 100644 --- a/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterHip.h +++ b/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterHip.h @@ -87,15 +87,34 @@ template <> struct SyncGrid { MGARDX_EXEC static void Sync() { cg::this_grid().sync(); } }; -template <> struct Atomic { - template MGARDX_EXEC static T Min(T *result, T value) { - return atomicMin(result, value); +template +struct Atomic { + MGARDX_EXEC static T Min(T *result, T value) { + if constexpr (Scope == AtomicSystemScope) { + return atomicMin_system(result, value); + } else if constexpr (Scope == AtomicDeviceScope) { + return atomicMin(result, value); + } else { + return atomicMin_block(result, value); + } } - template MGARDX_EXEC static T Max(T *result, T value) { - return atomicMax(result, value); + MGARDX_EXEC static T Max(T *result, T value) { + if constexpr (Scope == AtomicSystemScope) { + return atomicMax_system(result, value); + } else if constexpr (Scope == AtomicDeviceScope) { + return atomicMax(result, value); + } else { + return atomicMax_block(result, value); + } } - template MGARDX_EXEC static T Add(T *result, T value) { - return atomicAdd(result, value); + MGARDX_EXEC static T Add(T *result, T value) { + if constexpr (Scope == AtomicSystemScope) { + return atomicAdd_system(result, value); + } else if constexpr (Scope == AtomicDeviceScope) { + return atomicAdd(result, value); + } else { + return atomicAdd_block(result, value); + } } }; @@ -348,6 +367,7 @@ template <> class DeviceSpecification { MaxNumThreadsPerTB = new int[NumDevices]; AvailableMemory = new size_t[NumDevices]; SupportCooperativeGroups = new bool[NumDevices]; + DeviceNames = new std::string[NumDevices]; for (int d = 0; d < NumDevices; d++) { gpuErrchk(hipSetDevice(d)); @@ -373,6 +393,8 @@ template <> class DeviceSpecification { } MaxNumThreadsPerTB[d] = 32; // Due to a bug in Cooperative Groups in HIP WarpSize[d] = 32; + // DeviceNames[d] = std::string(prop.name); // Not working in HIP + DeviceNames[d] = std::string("AMD GPU"); } } @@ -410,6 +432,10 @@ template <> class DeviceSpecification { return SupportCooperativeGroups[dev_id]; } + MGARDX_CONT std::string GetDeviceName(int dev_id) { + return DeviceNames[dev_id]; + } + MGARDX_CONT ~DeviceSpecification() { delete[] MaxSharedMemorySize; @@ -420,6 +446,7 @@ template <> class DeviceSpecification { delete[] MaxNumThreadsPerTB; delete[] AvailableMemory; delete[] SupportCooperativeGroups; + delete[] DeviceNames; } int NumDevices; @@ -431,6 +458,7 @@ template <> class DeviceSpecification { int *MaxNumThreadsPerTB; size_t *AvailableMemory; bool *SupportCooperativeGroups; + std::string 
*DeviceNames; }; template <> class DeviceQueues { @@ -511,6 +539,10 @@ template <> class DeviceRuntime { gpuErrchk(hipDeviceSynchronize()); } + MGARDX_CONT static std::string GetDeviceName() { + return DeviceSpecs.GetDeviceName(curr_dev_id); + } + MGARDX_CONT static int GetMaxSharedMemorySize() { return DeviceSpecs.GetMaxSharedMemorySize(curr_dev_id); } @@ -1771,6 +1803,19 @@ template class DeviceAdapter { MGARDX_CONT DeviceAdapter(){}; + MGARDX_CONT + int IsResourceEnough(TaskType &task) { + if (task.GetBlockDimX() * task.GetBlockDimY() * task.GetBlockDimZ() > + DeviceRuntime::GetMaxNumThreadsPerTB()) { + return THREADBLOCK_TOO_LARGE; + } + if (task.GetSharedMemorySize() > + DeviceRuntime::GetMaxSharedMemorySize()) { + return SHARED_MEMORY_TOO_LARGE; + } + return RESOURCE_ENOUGH; + } + MGARDX_CONT ExecutionReturn Execute(TaskType &task) { dim3 threadsPerBlock(task.GetBlockDimX(), task.GetBlockDimY(), @@ -1790,6 +1835,21 @@ template class DeviceAdapter { << task.GetGridDimY() << ", " << task.GetGridDimZ() << ">\n"; } + ExecutionReturn ret; + if (IsResourceEnough(task) != RESOURCE_ENOUGH) { + if (DeviceRuntime::PrintKernelConfig) { + if (IsResourceEnough(task) == THREADBLOCK_TOO_LARGE) { + std::cout << log::log_info << "threadblock too large.\n"; + } + if (IsResourceEnough(task) == SHARED_MEMORY_TOO_LARGE) { + std::cout << log::log_info << "shared memory too large.\n"; + } + } + ret.success = false; + ret.execution_time = std::numeric_limits::max(); + return ret; + } + Timer timer; if (DeviceRuntime::TimingAllKernels || AutoTuner::ProfileKernels) { @@ -1831,7 +1891,6 @@ template class DeviceAdapter { ErrorSyncCheck(hipDeviceSynchronize(), task); } - ExecutionReturn ret; if (DeviceRuntime::TimingAllKernels || AutoTuner::ProfileKernels) { DeviceRuntime::SyncDevice(); @@ -1840,6 +1899,7 @@ template class DeviceAdapter { timer.print(task.GetFunctorName()); } if (AutoTuner::ProfileKernels) { + ret.success = true; ret.execution_time = timer.get(); } } diff --git a/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSerial.h b/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSerial.h index 2b7e94c83e..43bfa198c3 100644 --- a/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSerial.h +++ b/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSerial.h @@ -12,30 +12,31 @@ namespace mgard_x { -template <> struct SyncBlock { +template <> struct SyncBlock { MGARDX_EXEC static void Sync() { // do nothing } }; -template <> struct SyncGrid { +template <> struct SyncGrid { MGARDX_EXEC static void Sync() { // do nothing } }; -template <> struct Atomic { - template MGARDX_EXEC static T Min(T *result, T value) { +template +struct Atomic { + MGARDX_EXEC static T Min(T *result, T value) { T old = *result; *result = std::min(*result, value); return old; } - template MGARDX_EXEC static T Max(T *result, T value) { + MGARDX_EXEC static T Max(T *result, T value) { T old = *result; *result = std::max(*result, value); return old; } - template MGARDX_EXEC static T Add(T *result, T value) { + MGARDX_EXEC static T Add(T *result, T value) { T old = *result; *result += value; return old; @@ -62,7 +63,7 @@ static const int MultiplyDeBruijnBitPosition[64] = { 62, 11, 23, 32, 36, 44, 52, 55, 61, 22, 43, 51, 60, 42, 59, 58}; -template <> struct Math { +template <> struct Math { template MGARDX_EXEC static T Min(T a, T b) { return std::min(a, b); } @@ -617,7 +618,7 @@ MGARDX_KERL void SerialHuffmanCWCustomizedKernel(TaskType task) { DEALLOC_ACTIVE_GRID(loop1_active); } -template <> class DeviceSpecification { +template 
<> class DeviceSpecification { public: MGARDX_CONT DeviceSpecification() { @@ -630,6 +631,7 @@ template <> class DeviceSpecification { MaxNumThreadsPerTB = new int[NumDevices]; AvailableMemory = new size_t[NumDevices]; SupportCooperativeGroups = new bool[NumDevices]; + DeviceNames = new std::string[NumDevices]; for (int d = 0; d < NumDevices; d++) { MaxSharedMemorySize[d] = 1e6; @@ -639,6 +641,7 @@ template <> class DeviceSpecification { MaxNumThreadsPerTB[d] = 1024; ArchitectureGeneration[d] = 1; SupportCooperativeGroups[d] = true; + DeviceNames[d] = "CPU"; } } @@ -673,6 +676,10 @@ template <> class DeviceSpecification { return SupportCooperativeGroups[dev_id]; } + MGARDX_CONT std::string GetDeviceName(int dev_id) { + return DeviceNames[dev_id]; + } + MGARDX_CONT ~DeviceSpecification() { delete[] MaxSharedMemorySize; @@ -683,6 +690,7 @@ template <> class DeviceSpecification { delete[] MaxNumThreadsPerTB; delete[] AvailableMemory; delete[] SupportCooperativeGroups; + delete[] DeviceNames; } int NumDevices; @@ -694,9 +702,10 @@ template <> class DeviceSpecification { int *MaxNumThreadsPerTB; size_t *AvailableMemory; bool *SupportCooperativeGroups; + std::string *DeviceNames; }; -template <> class DeviceQueues { +template <> class DeviceQueues { public: MGARDX_CONT DeviceQueues() { @@ -719,7 +728,7 @@ template <> class DeviceQueues { } }; -template <> class DeviceRuntime { +template <> class DeviceRuntime { public: MGARDX_CONT DeviceRuntime() {} @@ -744,6 +753,10 @@ template <> class DeviceRuntime { // do nothing } + MGARDX_CONT static std::string GetDeviceName() { + return DeviceSpecs.GetDeviceName(curr_dev_id); + } + MGARDX_CONT static int GetMaxSharedMemorySize() { return DeviceSpecs.GetMaxSharedMemorySize(curr_dev_id); } @@ -793,14 +806,14 @@ template <> class DeviceRuntime { ~DeviceRuntime() {} static int curr_dev_id; - static DeviceQueues queues; + static DeviceQueues queues; static bool SyncAllKernelsAndCheckErrors; - static DeviceSpecification DeviceSpecs; + static DeviceSpecification DeviceSpecs; static bool TimingAllKernels; static bool PrintKernelConfig; }; -template <> class MemoryManager { +template <> class MemoryManager { public: MGARDX_CONT MemoryManager(){}; @@ -811,7 +824,7 @@ template <> class MemoryManager { typename std::conditional::value, Byte, T>::type; ptr = (T *)std::malloc(n * sizeof(converted_T)); if (ptr == NULL) { - std::cout << log::log_err << "MemoryManager::Malloc1D error.\n"; + std::cout << log::log_err << "MemoryManager::Malloc1D error.\n"; } } @@ -823,7 +836,7 @@ template <> class MemoryManager { ptr = (T *)std::malloc(n1 * n2 * sizeof(converted_T)); ld = n1; if (ptr == NULL) { - std::cout << log::log_err << "MemoryManager::Malloc1D error.\n"; + std::cout << log::log_err << "MemoryManager::Malloc1D error.\n"; } } @@ -834,7 +847,7 @@ template <> class MemoryManager { ptr = (T *)std::malloc(n * sizeof(converted_T)); if (ptr == NULL) { std::cout << log::log_err - << "MemoryManager::MallocManaged1D error.\n"; + << "MemoryManager::MallocManaged1D error.\n"; } } @@ -866,7 +879,7 @@ template <> class MemoryManager { typename std::conditional::value, Byte, T>::type; ptr = (T *)std::malloc(n * sizeof(converted_T)); if (ptr == NULL) { - std::cout << log::log_err << "MemoryManager::Malloc1D error.\n"; + std::cout << log::log_err << "MemoryManager::Malloc1D error.\n"; } } @@ -940,7 +953,7 @@ typedef unsigned long long int uint64_cu; template struct BlockBitTranspose { + METHOD, SERIAL> { MGARDX_EXEC static void Serial_All(T_org *v, T_trans *tv, SIZE b, SIZE B, SIZE 
IdX, @@ -988,7 +1001,7 @@ template struct BlockErrorCollect { + METHOD, BinaryType, SERIAL> { MGARDX_EXEC static void Serial_All(T *v, T_error *temp, T_error *errors, SIZE num_elems, @@ -998,7 +1011,7 @@ struct BlockErrorCollect::binary2negabinary(fps_data); + T_fp ngb_data = Math::binary2negabinary(fps_data); T_error mantissa; if (BinaryType == BINARY) { mantissa = fabs(data) - fp_data; @@ -1012,7 +1025,7 @@ struct BlockErrorCollect::negabinary2binary(ngb_data & mask) + + diff = (T_error)Math::negabinary2binary(ngb_data & mask) + mantissa; } errors[num_bitplanes - bitplane_idx] += diff * diff; @@ -1032,56 +1045,81 @@ struct BlockErrorCollect class DeviceAdapter { +template class DeviceAdapter { public: MGARDX_CONT DeviceAdapter(){}; MGARDX_CONT - ExecutionReturn Execute(TaskTypeType &task) { + int IsResourceEnough(TaskType &task) { + if (task.GetBlockDimX() * task.GetBlockDimY() * task.GetBlockDimZ() > + DeviceRuntime::GetMaxNumThreadsPerTB()) { + return THREADBLOCK_TOO_LARGE; + } + if (task.GetSharedMemorySize() > + DeviceRuntime::GetMaxSharedMemorySize()) { + return SHARED_MEMORY_TOO_LARGE; + } + return RESOURCE_ENOUGH; + } + + MGARDX_CONT + ExecutionReturn Execute(TaskType &task) { - if (DeviceRuntime::PrintKernelConfig) { + if (DeviceRuntime::PrintKernelConfig) { std::cout << log::log_info << task.GetFunctorName() << ": <" << task.GetBlockDimX() << ", " << task.GetBlockDimY() << ", " << task.GetBlockDimZ() << "> <" << task.GetGridDimX() << ", " << task.GetGridDimY() << ", " << task.GetGridDimZ() << ">\n"; } + ExecutionReturn ret; + if (IsResourceEnough(task) != RESOURCE_ENOUGH) { + if (DeviceRuntime::PrintKernelConfig) { + if (IsResourceEnough(task) == THREADBLOCK_TOO_LARGE) { + std::cout << log::log_info << "threadblock too large.\n"; + } + if (IsResourceEnough(task) == SHARED_MEMORY_TOO_LARGE) { + std::cout << log::log_info << "shared memory too large.\n"; + } + } + ret.success = false; + ret.execution_time = std::numeric_limits::max(); + return ret; + } + Timer timer; - if (DeviceRuntime::TimingAllKernels || - AutoTuner::ProfileKernels) { - DeviceRuntime::SyncDevice(); + if (DeviceRuntime::TimingAllKernels || + AutoTuner::ProfileKernels) { + DeviceRuntime::SyncDevice(); timer.start(); } // if constexpr evalute at compile time otherwise this does not compile - if constexpr (std::is_base_of, - typename TaskTypeType::Functor>::value) { + if constexpr (std::is_base_of, + typename TaskType::Functor>::value) { SerialKernel(task); - } else if constexpr (std::is_base_of< - IterFunctor, - typename TaskTypeType::Functor>::value) { + } else if constexpr (std::is_base_of, + typename TaskType::Functor>::value) { SerialIterKernel(task); - } else if constexpr (std::is_base_of< - HuffmanCLCustomizedFunctor, - typename TaskTypeType::Functor>::value) { + } else if constexpr (std::is_base_of, + typename TaskType::Functor>::value) { SerialHuffmanCLCustomizedKernel(task); - } else if constexpr (std::is_base_of< - HuffmanCWCustomizedFunctor, - typename TaskTypeType::Functor>::value) { + } else if constexpr (std::is_base_of, + typename TaskType::Functor>::value) { SerialHuffmanCWCustomizedKernel(task); } // timer.end(); // timer.print(task.GetFunctorName()); // timer.clear(); - ExecutionReturn ret; - if (DeviceRuntime::TimingAllKernels || - AutoTuner::ProfileKernels) { - DeviceRuntime::SyncDevice(); + if (DeviceRuntime::TimingAllKernels || + AutoTuner::ProfileKernels) { + DeviceRuntime::SyncDevice(); timer.end(); - if (DeviceRuntime::TimingAllKernels) { + if (DeviceRuntime::TimingAllKernels) { 
timer.print(task.GetFunctorName()); } - if (AutoTuner::ProfileKernels) { + if (AutoTuner::ProfileKernels) { + ret.success = true; ret.execution_time = timer.get(); } } @@ -1089,26 +1127,20 @@ template class DeviceAdapter { } }; -template struct KeyValueComparator { - bool operator()(std::pair a, std::pair b) const { - return a.first < b.first; - } -}; - -template <> class DeviceCollective { +template <> class DeviceCollective { public: MGARDX_CONT DeviceCollective(){}; template - MGARDX_CONT static void Sum(SIZE n, SubArray<1, T, Serial> &v, - SubArray<1, T, Serial> &result, int queue_idx) { + MGARDX_CONT static void Sum(SIZE n, SubArray<1, T, SERIAL> &v, + SubArray<1, T, SERIAL> &result, int queue_idx) { *result((IDX)0) = std::accumulate(v((IDX)0), v((IDX)n), 0); } template - MGARDX_CONT static void AbsMax(SIZE n, SubArray<1, T, Serial> &v, - SubArray<1, T, Serial> &result, + MGARDX_CONT static void AbsMax(SIZE n, SubArray<1, T, SERIAL> &v, + SubArray<1, T, SERIAL> &result, int queue_idx) { T max_result = 0; for (SIZE i = 0; i < n; ++i) { @@ -1118,8 +1150,8 @@ template <> class DeviceCollective { } template - MGARDX_CONT static void SquareSum(SIZE n, SubArray<1, T, Serial> &v, - SubArray<1, T, Serial> &result, + MGARDX_CONT static void SquareSum(SIZE n, SubArray<1, T, SERIAL> &v, + SubArray<1, T, SERIAL> &result, int queue_idx) { T sum_result = 0; for (SIZE i = 0; i < n; ++i) { @@ -1130,33 +1162,33 @@ template <> class DeviceCollective { } template - MGARDX_CONT static void ScanSumInclusive(SIZE n, SubArray<1, T, Serial> &v, - SubArray<1, T, Serial> &result, + MGARDX_CONT static void ScanSumInclusive(SIZE n, SubArray<1, T, SERIAL> &v, + SubArray<1, T, SERIAL> &result, int queue_idx) { // std::inclusive_scan(v(0), v(n), result(0)); - std::cout << log::log_err << "ScanSumInclusive not implemented.\n"; + std::cout << log::log_err << "ScanSumInclusive not implemented.\n"; } template - MGARDX_CONT static void ScanSumExclusive(SIZE n, SubArray<1, T, Serial> &v, - SubArray<1, T, Serial> &result, + MGARDX_CONT static void ScanSumExclusive(SIZE n, SubArray<1, T, SERIAL> &v, + SubArray<1, T, SERIAL> &result, int queue_idx) { // std::exclusive_scan(v(0), v(n), result(0)); - std::cout << log::log_err << "ScanSumExclusive not implemented.\n"; + std::cout << log::log_err << "ScanSumExclusive not implemented.\n"; } template - MGARDX_CONT static void ScanSumExtended(SIZE n, SubArray<1, T, Serial> &v, - SubArray<1, T, Serial> &result, + MGARDX_CONT static void ScanSumExtended(SIZE n, SubArray<1, T, SERIAL> &v, + SubArray<1, T, SERIAL> &result, int queue_idx) { // std::inclusive_scan(v(0), v(n), result(1)); // result(0) = 0; - std::cout << log::log_err << "ScanSumExtended not implemented.\n"; + std::cout << log::log_err << "ScanSumExtended not implemented.\n"; } template - MGARDX_CONT static void SortByKey(SIZE n, SubArray<1, KeyT, Serial> &keys, - SubArray<1, ValueT, Serial> &values, + MGARDX_CONT static void SortByKey(SIZE n, SubArray<1, KeyT, SERIAL> &keys, + SubArray<1, ValueT, SERIAL> &values, int queue_idx) { std::vector> data(n); for (SIZE i = 0; i < n; ++i) { diff --git a/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSycl.h b/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSycl.h new file mode 100644 index 0000000000..e39ea274e0 --- /dev/null +++ b/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSycl.h @@ -0,0 +1,1151 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. 
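The asynchronous exception_handler defined in this new adapter only runs when the owning queue is waited on. A minimal standalone sketch of that mechanism, assuming a SYCL 2020 toolchain (e.g. icpx -fsycl) — note the handler in the patch terminates the process, while this illustrative variant only logs:

#include <sycl/sycl.hpp>
#include <iostream>

int main() {
  // Logging counterpart of the adapter's exception_handler.
  auto log_handler = [](sycl::exception_list errs) {
    for (const std::exception_ptr &e : errs) {
      try {
        std::rethrow_exception(e);
      } catch (const sycl::exception &ex) {
        std::cout << "async SYCL error: " << ex.what() << "\n";
      }
    }
  };
  sycl::queue q{sycl::default_selector_v, log_handler};
  q.submit([&](sycl::handler &h) {
    h.parallel_for(sycl::range<1>{64}, [=](sycl::id<1>) { /* no-op */ });
  });
  q.wait_and_throw(); // asynchronous exceptions are delivered here
  return 0;
}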
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "DeviceAdapter.h" +#include + +#ifndef MGARD_X_DEVICE_ADAPTER_SYCL_H +#define MGARD_X_DEVICE_ADAPTER_SYCL_H + +namespace mgard_x { + +using LocalMemory = sycl::accessor; + +// Create an exception sycl::handler for asynchronous SYCL exceptions +static auto exception_handler = [](sycl::exception_list e_list) { + for (std::exception_ptr const &e : e_list) { + try { + std::rethrow_exception(e); + } catch (std::exception const &e) { + std::cout << "Failure" << std::endl; + std::terminate(); + } + } +}; + +template +struct Atomic { + MGARDX_EXEC static T Min(T *result, T value) { + if constexpr (MemoryType == AtomicGlobalMemory) { + if constexpr (Scope == AtomicSystemScope) { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::system, + sycl::access::address_space::global_space>; + return AtomicRef(result[0]).fetch_min(value); + } else if constexpr (Scope == AtomicDeviceScope) { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space>; + return AtomicRef(result[0]).fetch_min(value); + } else { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::work_group, + sycl::access::address_space::global_space>; + return AtomicRef(result[0]).fetch_min(value); + } + } else { + if constexpr (Scope == AtomicSystemScope) { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::system, + sycl::access::address_space::local_space>; + return AtomicRef(result[0]).fetch_min(value); + } else if constexpr (Scope == AtomicDeviceScope) { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::local_space>; + return AtomicRef(result[0]).fetch_min(value); + } else { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::work_group, + sycl::access::address_space::local_space>; + return AtomicRef(result[0]).fetch_min(value); + } + } + } + MGARDX_EXEC static T Max(T *result, T value) { + if constexpr (MemoryType == AtomicGlobalMemory) { + if constexpr (Scope == AtomicSystemScope) { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::system, + sycl::access::address_space::global_space>; + return AtomicRef(result[0]).fetch_max(value); + } else if constexpr (Scope == AtomicDeviceScope) { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space>; + return AtomicRef(result[0]).fetch_max(value); + } else { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::work_group, + sycl::access::address_space::global_space>; + return AtomicRef(result[0]).fetch_max(value); + } + } else { + if constexpr (Scope == AtomicSystemScope) { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::system, + sycl::access::address_space::local_space>; + return AtomicRef(result[0]).fetch_max(value); + } else if constexpr (Scope == AtomicDeviceScope) { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, 
sycl::memory_scope::device, + sycl::access::address_space::local_space>; + return AtomicRef(result[0]).fetch_max(value); + } else { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::work_group, + sycl::access::address_space::local_space>; + return AtomicRef(result[0]).fetch_max(value); + } + } + } + MGARDX_EXEC static T Add(T *result, T value) { + if constexpr (MemoryType == AtomicGlobalMemory) { + if constexpr (Scope == AtomicSystemScope) { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::system, + sycl::access::address_space::global_space>; + T new_value = AtomicRef(result[0]) += value; + return new_value - value; + } else if constexpr (Scope == AtomicDeviceScope) { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space>; + T new_value = AtomicRef(result[0]) += value; + return new_value - value; + } else { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::work_group, + sycl::access::address_space::global_space>; + T new_value = AtomicRef(result[0]) += value; + return new_value - value; + } + } else { + if constexpr (Scope == AtomicSystemScope) { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::system, + sycl::access::address_space::local_space>; + T new_value = AtomicRef(result[0]) += value; + return new_value - value; + } else if constexpr (Scope == AtomicDeviceScope) { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::local_space>; + T new_value = AtomicRef(result[0]) += value; + return new_value - value; + } else { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::work_group, + sycl::access::address_space::local_space>; + T new_value = AtomicRef(result[0]) += value; + return new_value - value; + } + } + } +}; + +template <> struct Math { + template MGARDX_EXEC static T Min(T a, T b) { + if constexpr (std::is_integral::value) + return sycl::min(a, b); + else { + return sycl::fmin(a, b); + } + } + template MGARDX_EXEC static T Max(T a, T b) { + if constexpr (std::is_integral::value) + return sycl::max(a, b); + else { + return sycl::fmax(a, b); + } + } + MGARDX_EXEC static int ffs(unsigned int a) { + int pos = 0; + if (a == 0) + return pos; + while (!(a & 1)) { + a >>= 1; + ++pos; + } + return pos + 1; + } + MGARDX_EXEC static int ffsll(long long unsigned int a) { + int pos = 0; + if (a == 0) + return pos; + while (!(a & 1)) { + a >>= 1; + ++pos; + } + return pos + 1; + } + MGARDX_EXEC + static uint64_t binary2negabinary(const int64_t x) { + return (x + (uint64_t)0xaaaaaaaaaaaaaaaaull) ^ + (uint64_t)0xaaaaaaaaaaaaaaaaull; + } + + MGARDX_EXEC + static uint32_t binary2negabinary(const int32_t x) { + return (x + (uint32_t)0xaaaaaaaau) ^ (uint32_t)0xaaaaaaaau; + } + + MGARDX_EXEC + static int64_t negabinary2binary(const uint64_t x) { + return (x ^ 0xaaaaaaaaaaaaaaaaull) - 0xaaaaaaaaaaaaaaaaull; + } + + MGARDX_EXEC + static int32_t negabinary2binary(const uint32_t x) { + return (x ^ 0xaaaaaaaau) - 0xaaaaaaaau; + } +}; + +template <> class DeviceSpecification { +public: + MGARDX_CONT + DeviceSpecification() { + sycl::default_selector d_selector; + sycl::platform d_platform(d_selector); + std::vector d_devices = 
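A note on the Add overloads above: operator+= on an atomic_ref returns the new value, so the code returns new_value - value to recover the pre-add value, matching CUDA's atomicAdd convention. With standard SYCL 2020 sycl::atomic_ref, fetch_add already returns the old value directly; an equivalent sketch, assuming device scope and global address space:

#include <sycl/sycl.hpp>

template <typename T>
T AddReturningOld(T *result, T value) {
  sycl::atomic_ref<T, sycl::memory_order::relaxed,
                   sycl::memory_scope::device,
                   sycl::access::address_space::global_space>
      ref(result[0]);
  return ref.fetch_add(value); // returns the value held before the add
}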
d_platform.get_devices(); + NumDevices = d_devices.size(); + MaxSharedMemorySize = new int[NumDevices]; + WarpSize = new int[NumDevices]; + NumSMs = new int[NumDevices]; + ArchitectureGeneration = new int[NumDevices]; + MaxNumThreadsPerSM = new int[NumDevices]; + MaxNumThreadsPerTB = new int[NumDevices]; + AvailableMemory = new size_t[NumDevices]; + SupportCooperativeGroups = new bool[NumDevices]; + DeviceNames = new std::string[NumDevices]; + + int d = 0; + for (auto &device : d_devices) { + MaxSharedMemorySize[d] = + device.get_info(); + WarpSize[d] = 32; + NumSMs[d] = device.get_info(); + MaxNumThreadsPerSM[d] = + device.get_info(); + ; + // Larger limit can cause resource insufficient error + MaxNumThreadsPerTB[d] = std::min( + 1024ul, device.get_info()); + ; + AvailableMemory[d] = + device.get_info(); + SupportCooperativeGroups[d] = false; + DeviceNames[d] = std::string(device.get_info()); + d++; + } + } + + MGARDX_CONT int GetNumDevices() { return NumDevices; } + + MGARDX_CONT int GetMaxSharedMemorySize(int dev_id) { + return MaxSharedMemorySize[dev_id]; + } + + MGARDX_CONT int GetWarpSize(int dev_id) { return WarpSize[dev_id]; } + + MGARDX_CONT int GetNumSMs(int dev_id) { return NumSMs[dev_id]; } + + MGARDX_CONT int GetArchitectureGeneration(int dev_id) { + return ArchitectureGeneration[dev_id]; + } + + MGARDX_CONT int GetMaxNumThreadsPerSM(int dev_id) { + return MaxNumThreadsPerSM[dev_id]; + } + + MGARDX_CONT int GetMaxNumThreadsPerTB(int dev_id) { + return MaxNumThreadsPerTB[dev_id]; + } + + MGARDX_CONT size_t GetAvailableMemory(int dev_id) { + return AvailableMemory[dev_id]; + } + + MGARDX_CONT bool SupportCG(int dev_id) { + return SupportCooperativeGroups[dev_id]; + } + + MGARDX_CONT std::string GetDeviceName(int dev_id) { + return DeviceNames[dev_id]; + } + + MGARDX_CONT + ~DeviceSpecification() { + delete[] MaxSharedMemorySize; + delete[] WarpSize; + delete[] NumSMs; + delete[] ArchitectureGeneration; + delete[] MaxNumThreadsPerSM; + delete[] MaxNumThreadsPerTB; + delete[] AvailableMemory; + delete[] SupportCooperativeGroups; + delete[] DeviceNames; + } + + int NumDevices; + int *MaxSharedMemorySize; + int *WarpSize; + int *NumSMs; + int *ArchitectureGeneration; + int *MaxNumThreadsPerSM; + int *MaxNumThreadsPerTB; + size_t *AvailableMemory; + bool *SupportCooperativeGroups; + std::string *DeviceNames; +}; + +template <> class DeviceQueues { +public: + MGARDX_CONT + DeviceQueues() { + sycl::default_selector d_selector; + sycl::platform d_platform(d_selector); + std::vector d_devices = d_platform.get_devices(); + NumDevices = d_devices.size(); + queues = new sycl::queue *[NumDevices]; + for (SIZE d = 0; d < NumDevices; d++) { + queues[d] = new sycl::queue[MGARDX_NUM_QUEUES]; + for (SIZE i = 0; i < MGARDX_NUM_QUEUES; i++) { + queues[d][i] = sycl::queue(d_devices[d], exception_handler); + } + } + } + + MGARDX_CONT sycl::queue GetQueue(int dev_id, SIZE queue_id) { + return queues[dev_id][queue_id]; + } + + MGARDX_CONT void SyncQueue(int dev_id, SIZE queue_id) { + queues[dev_id][queue_id].wait(); + } + + MGARDX_CONT void SyncAllQueues(int dev_id) { + for (SIZE i = 0; i < MGARDX_NUM_QUEUES; i++) { + queues[dev_id][i].wait(); + } + } + + MGARDX_CONT + ~DeviceQueues() { + for (SIZE d = 0; d < NumDevices; d++) { + delete[] queues[d]; + } + delete[] queues; + queues = NULL; + } + + int NumDevices; + sycl::queue **queues = NULL; +}; + +template <> class DeviceRuntime { +public: + MGARDX_CONT + DeviceRuntime() {} + + MGARDX_CONT static int GetDeviceCount() { return DeviceSpecs.NumDevices; 
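The get_info calls in the DeviceSpecification constructor above lost their template arguments in this diff rendering; the SYCL 2020 descriptors that match the fields being filled are presumably the following (illustrative only):

#include <sycl/sycl.hpp>
#include <string>

void QueryOneDevice(const sycl::device &dev) {
  size_t local_mem   = dev.get_info<sycl::info::device::local_mem_size>();
  unsigned int cus   = dev.get_info<sycl::info::device::max_compute_units>();
  size_t max_wg_size = dev.get_info<sycl::info::device::max_work_group_size>();
  size_t global_mem  = dev.get_info<sycl::info::device::global_mem_size>();
  std::string name   = dev.get_info<sycl::info::device::name>();
  (void)local_mem; (void)cus; (void)max_wg_size; (void)global_mem; (void)name;
}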
} + + MGARDX_CONT static void SelectDevice(SIZE dev_id) { curr_dev_id = dev_id; } + + MGARDX_CONT static sycl::queue GetQueue(SIZE queue_id) { + return queues.GetQueue(curr_dev_id, queue_id); + } + + MGARDX_CONT static void SyncQueue(SIZE queue_id) { + queues.SyncQueue(curr_dev_id, queue_id); + } + + MGARDX_CONT static void SyncAllQueues() { queues.SyncAllQueues(curr_dev_id); } + + MGARDX_CONT static void SyncDevice() { queues.SyncAllQueues(curr_dev_id); } + + MGARDX_CONT static std::string GetDeviceName() { + return DeviceSpecs.GetDeviceName(curr_dev_id); + } + + MGARDX_CONT static int GetMaxSharedMemorySize() { + return DeviceSpecs.GetMaxSharedMemorySize(curr_dev_id); + } + + MGARDX_CONT static int GetWarpSize() { + return DeviceSpecs.GetWarpSize(curr_dev_id); + } + + MGARDX_CONT static int GetNumSMs() { + return DeviceSpecs.GetNumSMs(curr_dev_id); + } + + MGARDX_CONT static int GetArchitectureGeneration() { + return DeviceSpecs.GetArchitectureGeneration(curr_dev_id); + } + + MGARDX_CONT static int GetMaxNumThreadsPerSM() { + return DeviceSpecs.GetMaxNumThreadsPerSM(curr_dev_id); + } + + MGARDX_CONT static int GetMaxNumThreadsPerTB() { + return DeviceSpecs.GetMaxNumThreadsPerTB(curr_dev_id); + } + + MGARDX_CONT static size_t GetAvailableMemory() { + return DeviceSpecs.GetAvailableMemory(curr_dev_id); + } + + MGARDX_CONT static bool SupportCG() { + return DeviceSpecs.SupportCG(curr_dev_id); + } + + template + MGARDX_CONT static int + GetOccupancyMaxActiveBlocksPerSM(FunctorType functor, int blockSize, + size_t dynamicSMemSize) { + return 32; + } + + template + MGARDX_CONT static void SetMaxDynamicSharedMemorySize(FunctorType functor, + int maxbytes) { + // do nothing + } + + MGARDX_CONT + ~DeviceRuntime() {} + + static int curr_dev_id; + static DeviceQueues queues; + static bool SyncAllKernelsAndCheckErrors; + static DeviceSpecification DeviceSpecs; + static bool TimingAllKernels; + static bool PrintKernelConfig; +}; + +template <> class MemoryManager { +public: + MGARDX_CONT + MemoryManager(){}; + + template + MGARDX_CONT static void Malloc1D(T *&ptr, SIZE n, int queue_idx) { + sycl::queue q = DeviceRuntime::GetQueue(queue_idx); + using converted_T = + typename std::conditional::value, Byte, T>::type; + ptr = malloc_device(n, q); + } + + template + MGARDX_CONT static void MallocND(T *&ptr, SIZE n1, SIZE n2, SIZE &ld, + int queue_idx) { + sycl::queue q = DeviceRuntime::GetQueue(queue_idx); + using converted_T = + typename std::conditional::value, Byte, T>::type; + ptr = malloc_device(n1 * n2, q); + ld = n1; + } + + template + MGARDX_CONT static void MallocManaged1D(T *&ptr, SIZE n, int queue_idx) { + sycl::queue q = DeviceRuntime::GetQueue(queue_idx); + using converted_T = + typename std::conditional::value, Byte, T>::type; + ptr = malloc_shared(n, q); + } + + template MGARDX_CONT static void Free(T *ptr) { + // printf("MemoryManager.Free(%llu)\n", ptr); + sycl::queue q = DeviceRuntime::GetQueue(0); + if (ptr == NULL) + return; + sycl::free(ptr, q); + } + + template + MGARDX_CONT static void Copy1D(T *dst_ptr, const T *src_ptr, SIZE n, + int queue_idx) { + sycl::queue q = DeviceRuntime::GetQueue(queue_idx); + using converted_T = + typename std::conditional::value, Byte, T>::type; + q.memcpy((converted_T *)dst_ptr, (converted_T *)src_ptr, + n * sizeof(converted_T)); + } + + template + MGARDX_CONT static void CopyND(T *dst_ptr, SIZE dst_ld, const T *src_ptr, + SIZE src_ld, SIZE n1, SIZE n2, int queue_idx) { + sycl::queue q = DeviceRuntime::GetQueue(queue_idx); + using converted_T = + 
typename std::conditional::value, Byte, T>::type; + q.memcpy((converted_T *)dst_ptr, (converted_T *)src_ptr, + n1 * n2 * sizeof(converted_T)); + } + + template + MGARDX_CONT static void MallocHost(T *&ptr, SIZE n, int queue_idx) { + sycl::queue q = DeviceRuntime::GetQueue(queue_idx); + using converted_T = + typename std::conditional::value, Byte, T>::type; + ptr = malloc_host(n, q); + } + + template MGARDX_CONT static void FreeHost(T *ptr) { + sycl::queue q = DeviceRuntime::GetQueue(0); + if (ptr == NULL) + return; + sycl::free(ptr, q); + } + + template + MGARDX_CONT static void Memset1D(T *ptr, SIZE n, int value, int queue_idx) { + sycl::queue q = DeviceRuntime::GetQueue(queue_idx); + using converted_T = + typename std::conditional::value, Byte, T>::type; + q.memset((converted_T *)ptr, value, n * sizeof(converted_T)); + } + + template + MGARDX_CONT static void MemsetND(T *ptr, SIZE ld, SIZE n1, SIZE n2, int value, + int queue_idx) { + sycl::queue q = DeviceRuntime::GetQueue(queue_idx); + using converted_T = + typename std::conditional::value, Byte, T>::type; + q.memset((converted_T *)ptr, value, n1 * n2 * sizeof(converted_T)); + } + + template MGARDX_CONT static bool IsDevicePointer(T *ptr) { + sycl::queue q = DeviceRuntime::GetQueue(0); + return sycl::get_pointer_type(ptr, q.get_context()) == + sycl::usm::alloc::device; + } + + static bool ReduceMemoryFootprint; +}; + +template class Kernel { +public: + Kernel(FunctorType functor, LocalMemory localAccess) + : functor(functor), localAccess(localAccess) {} + void operator()(sycl::nd_item<3> i) const { + FunctorType my_functor = functor; + sycl::local_ptr l_ptr = localAccess.get_pointer(); + Byte *shared_memory = l_ptr.get(); + my_functor.Init( + i.get_group_range(2), i.get_group_range(1), i.get_group_range(0), + i.get_global_range(2) / i.get_group_range(2), + i.get_global_range(1) / i.get_group_range(1), + i.get_global_range(0) / i.get_group_range(0), i.get_group().get_id(2), + i.get_group().get_id(1), i.get_group().get_id(0), i.get_local_id(2), + i.get_local_id(1), i.get_local_id(0), shared_memory); + + my_functor.Operation1(); + i.barrier(); + my_functor.Operation2(); + i.barrier(); + my_functor.Operation3(); + i.barrier(); + my_functor.Operation4(); + i.barrier(); + my_functor.Operation5(); + i.barrier(); + my_functor.Operation6(); + i.barrier(); + my_functor.Operation7(); + i.barrier(); + my_functor.Operation8(); + i.barrier(); + my_functor.Operation9(); + i.barrier(); + my_functor.Operation10(); + } + +private: + FunctorType functor; + LocalMemory localAccess; +}; + +template class IterKernel { +public: + IterKernel(FunctorType functor, LocalMemory localAccess) + : functor(functor), localAccess(localAccess) {} + void operator()(sycl::nd_item<3> i) const { + FunctorType my_functor = functor; + Byte *shared_memory = localAccess.get_pointer().get(); + my_functor.Init( + i.get_group_range(2), i.get_group_range(1), i.get_group_range(0), + i.get_global_range(2) / i.get_group_range(2), + i.get_global_range(1) / i.get_group_range(1), + i.get_global_range(0) / i.get_group_range(0), i.get_group().get_id(2), + i.get_group().get_id(1), i.get_group().get_id(0), i.get_local_id(2), + i.get_local_id(1), i.get_local_id(0), shared_memory); + + my_functor.Operation1(); + i.barrier(); + + my_functor.Operation2(); + i.barrier(); + + while (my_functor.LoopCondition1()) { + my_functor.Operation3(); + i.barrier(); + my_functor.Operation4(); + i.barrier(); + my_functor.Operation5(); + i.barrier(); + my_functor.Operation6(); + i.barrier(); + } + + 
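On the Init(...) calls in Kernel and IterKernel above: SYCL orders nd_item dimensions with index 0 slowest-varying, the reverse of CUDA's threadIdx.x, which is why dimension 2 is passed where a CUDA backend would pass the X index. A small mapping helper under that assumption (illustrative, not part of the patch):

#include <sycl/sycl.hpp>

struct ThreadIdx3 {
  size_t x, y, z;
};

inline ThreadIdx3 CudaStyleLocalId(const sycl::nd_item<3> &it) {
  return ThreadIdx3{it.get_local_id(2),  // fastest-varying -> x
                    it.get_local_id(1),  // -> y
                    it.get_local_id(0)}; // slowest-varying -> z
}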
my_functor.Operation7(); + i.barrier(); + my_functor.Operation8(); + i.barrier(); + my_functor.Operation9(); + i.barrier(); + my_functor.Operation10(); + i.barrier(); + + while (my_functor.LoopCondition2()) { + my_functor.Operation11(); + i.barrier(); + my_functor.Operation12(); + i.barrier(); + my_functor.Operation13(); + i.barrier(); + my_functor.Operation14(); + i.barrier(); + } + + my_functor.Operation15(); + i.barrier(); + my_functor.Operation16(); + i.barrier(); + my_functor.Operation17(); + i.barrier(); + } + +private: + FunctorType functor; + LocalMemory localAccess; +}; + +#define SINGLE_KERNEL(OPERATION) \ + template class Single_##OPERATION##_Kernel { \ + public: \ + Single_##OPERATION##_Kernel(FunctorType functor, LocalMemory localAccess) \ + : functor(functor), localAccess(localAccess) {} \ + void operator()(sycl::nd_item<3> i) const { \ + FunctorType my_functor = functor; \ + Byte *shared_memory = localAccess.get_pointer().get(); \ + my_functor.Init(i.get_group_range(2), i.get_group_range(1), \ + i.get_group_range(0), \ + i.get_global_range(2) / i.get_group_range(2), \ + i.get_global_range(1) / i.get_group_range(1), \ + i.get_global_range(0) / i.get_group_range(0), \ + i.get_group().get_id(2), i.get_group().get_id(1), \ + i.get_group().get_id(0), i.get_local_id(2), \ + i.get_local_id(1), i.get_local_id(0), shared_memory); \ + my_functor.OPERATION(); \ + i.barrier(); \ + } \ + \ + private: \ + FunctorType functor; \ + LocalMemory localAccess; \ + }; + +SINGLE_KERNEL(Operation1); +SINGLE_KERNEL(Operation2); +SINGLE_KERNEL(Operation3); +SINGLE_KERNEL(Operation4); +SINGLE_KERNEL(Operation5); +SINGLE_KERNEL(Operation6); +SINGLE_KERNEL(Operation7); +SINGLE_KERNEL(Operation8); +SINGLE_KERNEL(Operation9); +SINGLE_KERNEL(Operation10); +SINGLE_KERNEL(Operation11); +SINGLE_KERNEL(Operation12); +SINGLE_KERNEL(Operation13); +SINGLE_KERNEL(Operation14); + +#undef SINGLE_KERNEL + +template class ParallelMergeKernel { +public: + ParallelMergeKernel(FunctorType functor, LocalMemory localAccess) + : functor(functor), localAccess(localAccess) {} + void operator()(sycl::nd_item<3> i) const { + FunctorType my_functor = functor; + Byte *shared_memory = localAccess.get_pointer().get(); + my_functor.Init( + i.get_group_range(2), i.get_group_range(1), i.get_group_range(0), + i.get_global_range(2) / i.get_group_range(2), + i.get_global_range(1) / i.get_group_range(1), + i.get_global_range(0) / i.get_group_range(0), i.get_group().get_id(2), + i.get_group().get_id(1), i.get_group().get_id(0), i.get_local_id(2), + i.get_local_id(1), i.get_local_id(0), shared_memory); + + my_functor.Operation5(); + i.barrier(); + while (my_functor.LoopCondition2()) { + my_functor.Operation6(); + i.barrier(); + my_functor.Operation7(); + i.barrier(); + my_functor.Operation8(); + i.barrier(); + } + my_functor.Operation9(); + } + +private: + FunctorType functor; + LocalMemory localAccess; +}; + +template void HuffmanCLCustomizedNoCGKernel(Task task) { + // std::cout << "calling HuffmanCLCustomizedNoCGKernel\n"; + sycl::range global_threads(task.GetBlockDimX() * task.GetGridDimX(), + task.GetBlockDimY() * task.GetGridDimY(), + task.GetBlockDimZ() * task.GetGridDimZ()); + + sycl::range local_threads(task.GetBlockDimX(), task.GetBlockDimY(), + task.GetBlockDimZ()); + + size_t sm_size = task.GetSharedMemorySize(); + if (sm_size == 0) + sm_size = 1; // avoid -51 (CL_INVALID_ARG_SIZE) error + + sycl::queue q = DeviceRuntime::GetQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation1_Kernel\n"; + 
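The launch sequence that follows replaces grid-wide cooperative groups (unsupported here, per SupportCooperativeGroups[d] = false above) with one kernel per Operation phase and a queue wait between phases, so host-side conditions such as LoopCondition1() observe up-to-date functor state. Every submission below has the same shape; a generic single-phase launcher distilling it, assuming the LocalMemory accessor alias from the top of this file:

// Hypothetical helper summarizing the repeated submit/wait pattern below.
template <typename KernelT, typename TaskT>
void LaunchPhase(sycl::queue &q, TaskT &task, size_t sm_size,
                 sycl::range<3> global, sycl::range<3> local) {
  q.submit([&](sycl::handler &h) {
    LocalMemory localAccess{sm_size, h}; // shared memory as a local accessor
    KernelT kernel(task.GetFunctor(), localAccess);
    h.parallel_for(sycl::nd_range<3>{global, local}, kernel);
  });
  q.wait(); // host must observe this phase before evaluating the next
}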
q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation1_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling LoopCondition1\n"; + while (task.GetFunctor().LoopCondition1()) { + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation2_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation2_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation3_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation3_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation4_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation4_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling BranchCondition1\n"; + if (task.GetFunctor().BranchCondition1()) { + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling ParallelMergeKernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + ParallelMergeKernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation10_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation10_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + } + + // std::cout << "calling Single_Operation11_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation11_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation12_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation12_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation13_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation13_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + // std::cout << "calling Single_Operation14_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation14_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + } +} + +template void HuffmanCWCustomizedNoCGKernel(Task task) 
{ + // std::cout << "calling HuffmanCWCustomizedNoCGKernel\n"; + sycl::range global_threads(task.GetBlockDimX() * task.GetGridDimX(), + task.GetBlockDimY() * task.GetGridDimY(), + task.GetBlockDimZ() * task.GetGridDimZ()); + + sycl::range local_threads(task.GetBlockDimX(), task.GetBlockDimY(), + task.GetBlockDimZ()); + + size_t sm_size = task.GetSharedMemorySize(); + if (sm_size == 0) + sm_size = 1; // avoid -51 (CL_INVALID_ARG_SIZE) error + + sycl::queue q = DeviceRuntime::GetQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation1_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation1_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + // std::cout << "calling Single_Operation2_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation2_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + // std::cout << "calling Single_Operation3_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation3_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling LoopCondition1\n"; + while (task.GetFunctor().LoopCondition1()) { + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation4_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation4_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation5_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation5_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation6_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation6_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation7_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation7_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation8_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation8_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + } + + // std::cout << "calling Single_Operation9_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation9_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + 
DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation10_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation10_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); +} + +// template class DeviceAdapter { +public: + // inline constexpr bool sycl::is_device_copyable_v = true; + + MGARDX_CONT + DeviceAdapter(){}; + + MGARDX_CONT + int IsResourceEnough(TaskType &task) { + if (task.GetBlockDimX() * task.GetBlockDimY() * task.GetBlockDimZ() > + DeviceRuntime::GetMaxNumThreadsPerTB()) { + return THREADBLOCK_TOO_LARGE; + } + if (task.GetSharedMemorySize() > + DeviceRuntime::GetMaxSharedMemorySize()) { + return SHARED_MEMORY_TOO_LARGE; + } + return RESOURCE_ENOUGH; + } + + MGARDX_CONT + ExecutionReturn Execute(TaskType &task) { + + sycl::range global_threads(task.GetBlockDimX() * task.GetGridDimX(), + task.GetBlockDimY() * task.GetGridDimY(), + task.GetBlockDimZ() * task.GetGridDimZ()); + + sycl::range local_threads(task.GetBlockDimX(), task.GetBlockDimY(), + task.GetBlockDimZ()); + + size_t sm_size = task.GetSharedMemorySize(); + if (sm_size == 0) + sm_size = 1; // avoid -51 (CL_INVALID_ARG_SIZE) error + + sycl::queue q = DeviceRuntime::GetQueue(task.GetQueueIdx()); + + if (DeviceRuntime::PrintKernelConfig) { + std::cout << log::log_info << task.GetFunctorName() << ": <" + << task.GetBlockDimX() << ", " << task.GetBlockDimY() << ", " + << task.GetBlockDimZ() << "> <" << task.GetGridDimX() << ", " + << task.GetGridDimY() << ", " << task.GetGridDimZ() << ">\n"; + } + + ExecutionReturn ret; + if (IsResourceEnough(task) != RESOURCE_ENOUGH) { + if (DeviceRuntime::PrintKernelConfig) { + if (IsResourceEnough(task) == THREADBLOCK_TOO_LARGE) { + std::cout << log::log_info << "threadblock too large.\n"; + } + if (IsResourceEnough(task) == SHARED_MEMORY_TOO_LARGE) { + std::cout << log::log_info << "shared memory too large.\n"; + } + } + ret.success = false; + ret.execution_time = std::numeric_limits::max(); + return ret; + } + + Timer timer; + if (DeviceRuntime::TimingAllKernels || + AutoTuner::ProfileKernels) { + DeviceRuntime::SyncDevice(); + timer.start(); + } + + // if constexpr evaluate at compile time otherwise this does not compile + if constexpr (std::is_base_of, + typename TaskType::Functor>::value) { + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + } else if constexpr (std::is_base_of, + typename TaskType::Functor>::value) { + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + IterKernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + } else if constexpr (std::is_base_of, + typename TaskType::Functor>::value) { + HuffmanCLCustomizedNoCGKernel(task); + } else if constexpr (std::is_base_of, + typename TaskType::Functor>::value) { + HuffmanCWCustomizedNoCGKernel(task); + } + if (DeviceRuntime::SyncAllKernelsAndCheckErrors) { + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + } + + if (DeviceRuntime::TimingAllKernels || + AutoTuner::ProfileKernels) { + DeviceRuntime::SyncDevice(); + timer.end(); + if (DeviceRuntime::TimingAllKernels) { + timer.print(task.GetFunctorName()); + } + if (AutoTuner::ProfileKernels) { + ret.success = true; + 
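A caveat on the DeviceCollective reductions defined just below: sycl::nd_range{global, local} requires the global size to be a multiple of the work-group size, and the kernels read input[i] unguarded, so n is implicitly assumed to be a multiple of 256 (or 4 for AbsMax). A size-safe variant would pad the range and bounds-check inside the kernel (illustrative sketch, not part of the patch):

#include <sycl/sycl.hpp>

template <typename T>
void SafeSum(sycl::queue &q, const T *input, T *res, size_t n) {
  constexpr size_t WG = 256;
  const size_t padded = ((n + WG - 1) / WG) * WG; // round up to a WG multiple
  q.submit([&](sycl::handler &h) {
    h.parallel_for(sycl::nd_range<1>{padded, WG},
                   sycl::reduction(res, T(0), sycl::plus<T>()),
                   [=](sycl::nd_item<1> it, auto &sum) {
                     const size_t i = it.get_global_id(0);
                     if (i < n) sum.combine(input[i]); // mask the padding
                   });
  });
  q.wait();
}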
ret.execution_time = timer.get(); + } + } + return ret; + } +}; + +template struct AbsMaxOp { + T operator()(const T &a, const T &b) const { + return (fabs(b) > fabs(a)) ? fabs(b) : fabs(a); + } +}; + +template struct SquareOp { + T operator()(const T &a) const { return a * a; } +}; + +template <> class DeviceCollective { +public: + MGARDX_CONT + DeviceCollective(){}; + + template + MGARDX_CONT static void Sum(SIZE n, SubArray<1, T, SYCL> &v, + SubArray<1, T, SYCL> &result, int queue_idx) { + + sycl::queue q = DeviceRuntime::GetQueue(queue_idx); + q.submit([&](sycl::handler &h) { + T *res = result.data(); + T *input = v.data(); + sycl::range global{n}; + sycl::range local{256}; + h.parallel_for(sycl::nd_range{global, local}, + sycl::reduction(res, (T)0, sycl::plus()), + [=](sycl::nd_item<1> it, auto &res) { + size_t i = it.get_global_id(0); + res.combine(input[i]); + }); + }); + DeviceRuntime::SyncDevice(); + } + + template + MGARDX_CONT static void AbsMax(SIZE n, SubArray<1, T, SYCL> &v, + SubArray<1, T, SYCL> &result, int queue_idx) { + sycl::queue q = DeviceRuntime::GetQueue(queue_idx); + q.submit([&](sycl::handler &h) { + T *res = result.data(); + T *input = v.data(); + sycl::range global{n}; + sycl::range local{4}; + h.parallel_for(sycl::nd_range{global, local}, + sycl::reduction(res, (T)0, AbsMaxOp()), + [=](sycl::nd_item<1> it, auto &res) { + size_t i = it.get_global_id(0); + res.combine(input[i]); + }); + }); + DeviceRuntime::SyncDevice(); + } + + template + MGARDX_CONT static void SquareSum(SIZE n, SubArray<1, T, SYCL> &v, + SubArray<1, T, SYCL> &result, + int queue_idx) { + sycl::queue q = DeviceRuntime::GetQueue(queue_idx); + q.submit([&](sycl::handler &h) { + T *res = result.data(); + T *input = v.data(); + sycl::range global{n}; + sycl::range local{256}; + h.parallel_for(sycl::nd_range{global, local}, + sycl::reduction(res, (T)0, sycl::plus()), + [=](sycl::nd_item<1> it, auto &res) { + size_t i = it.get_global_id(0); + res.combine(input[i] * input[i]); + }); + }); + DeviceRuntime::SyncDevice(); + } + + template + MGARDX_CONT static void ScanSumInclusive(SIZE n, SubArray<1, T, SYCL> &v, + SubArray<1, T, SYCL> &result, + int queue_idx) {} + + template + MGARDX_CONT static void ScanSumExclusive(SIZE n, SubArray<1, T, SYCL> &v, + SubArray<1, T, SYCL> &result, + int queue_idx) {} + + template + MGARDX_CONT static void ScanSumExtended(SIZE n, SubArray<1, T, SYCL> &v, + SubArray<1, T, SYCL> &result, + int queue_idx) {} + + template + MGARDX_CONT static void SortByKey(SIZE n, SubArray<1, KeyT, SYCL> &keys, + SubArray<1, ValueT, SYCL> &values, + int queue_idx) { + KeyT *keys_array = new KeyT[n]; + ValueT *values_array = new ValueT[n]; + MemoryManager::Copy1D(keys_array, keys.data(), n, 0); + MemoryManager::Copy1D(values_array, values.data(), n, 0); + DeviceRuntime::SyncQueue(0); + + std::vector> data(n); + for (SIZE i = 0; i < n; ++i) { + data[i] = std::pair(keys_array[i], values_array[i]); + } + std::stable_sort(data.begin(), data.end(), + KeyValueComparator{}); + for (SIZE i = 0; i < n; ++i) { + keys_array[i] = data[i].first; + values_array[i] = data[i].second; + } + MemoryManager::Copy1D(keys.data(), keys_array, n, 0); + MemoryManager::Copy1D(values.data(), values_array, n, 0); + DeviceRuntime::SyncDevice(); + delete[] keys_array; + delete[] values_array; + } + + template + MGARDX_CONT static void + ScanOpInclusiveByKey(SubArray<1, SIZE, SYCL> &key, + SubArray<1, ValueT, SYCL> &v, + SubArray<1, ValueT, SYCL> &result, int queue_idx) {} +}; + +} // namespace mgard_x +#endif \ No 
newline at end of file diff --git a/include/mgard-x/RuntimeX/RuntimeX.h b/include/mgard-x/RuntimeX/RuntimeX.h index e22f930c42..e0b575145b 100644 --- a/include/mgard-x/RuntimeX/RuntimeX.h +++ b/include/mgard-x/RuntimeX/RuntimeX.h @@ -14,11 +14,8 @@ #include "AutoTuners/AutoTuner.h" #include "Tasks/Task.h" -#if MGARD_ENABLE_SERIAL -#ifdef MGARDX_COMPILE_SERIAL +// Serial backend should be always available #include "DeviceAdapters/DeviceAdapterSerial.h" -#endif -#endif #if MGARD_ENABLE_CUDA #ifdef MGARDX_COMPILE_CUDA @@ -32,6 +29,12 @@ #endif #endif +#if MGARD_ENABLE_SYCL +#ifdef MGARDX_COMPILE_SYCL +#include "DeviceAdapters/DeviceAdapterSycl.h" +#endif +#endif + #if RUNTIME_X_ENABLE_KOKKOS #include "DeviceAdapters/DeviceAdapterKokkos.h" #endif diff --git a/include/mgard-x/RuntimeX/Utilities/SubArrayPrinter.hpp b/include/mgard-x/RuntimeX/Utilities/SubArrayPrinter.hpp index e4c5717260..02e42a4456 100644 --- a/include/mgard-x/RuntimeX/Utilities/SubArrayPrinter.hpp +++ b/include/mgard-x/RuntimeX/Utilities/SubArrayPrinter.hpp @@ -53,6 +53,7 @@ void PrintSubarray(std::string name, SubArrayType subArray) { // ncol, subArray.data(), subArray.lddv1 * sizeof(T), // nfib * sizeof(T), subArray.lddv2, nfib * sizeof(T), // ncol, nrow, D2H, 0); + DeviceRuntime::SyncQueue(0); for (SIZE i = 0; i < nrow; i++) { MemoryManager::CopyND( v + ncol * nfib * i, nfib, @@ -251,6 +252,48 @@ void CompareSubarray(std::string name, SubArrayType1 subArray1, delete[] v2; } +template +void CompareSubarray4D(SubArrayType subArray1, SubArrayType subArray2) { + if (SubArrayType::NumDims != 4) { + std::cout << log::log_err + << "CompareSubarray4D expects 4D subarray type.\n"; + exit(-1); + } + if (subArray1.getShape(3) != subArray2.getShape(3)) { + std::cout << log::log_err << "CompareSubarray4D mismatch 4D size.\n"; + exit(-1); + } + + using T = typename SubArrayType::DataType; + SIZE idx[4] = {0, 0, 0, 0}; + for (SIZE i = 0; i < subArray1.getShape(3); i++) { + idx[3] = i; + SubArrayType temp1 = subArray1; + SubArrayType temp2 = subArray2; + temp1.offset(3, i); + temp2.offset(3, i); + CompareSubarray("4D = " + std::to_string(i), temp1.Slice3D(0, 1, 2), + temp2.Slice3D(0, 1, 2)); + } +} + +template +void PrintSubarray4D(std::string name, SubArrayType subArray1) { + if (SubArrayType::NumDims != 4) { + std::cout << log::log_err << "PrintSubarray4D expects 4D subarray type.\n"; + exit(-1); + } + std::cout << name << "\n"; + using T = typename SubArrayType::DataType; + SIZE idx[4] = {0, 0, 0, 0}; + for (SIZE i = 0; i < subArray1.getShape(3); i++) { + idx[3] = i; + SubArrayType temp1 = subArray1; + temp1.offset(3, i); + PrintSubarray("i = " + std::to_string(i), temp1.Slice3D(0, 1, 2)); + } +} + // print 3D CPU template void verify_matrix(SIZE nrow, SIZE ncol, SIZE nfib, T *v, SIZE ldv1, SIZE ldv2, diff --git a/include/mgard-x/Testing/ReorderToolsCPU.h b/include/mgard-x/Testing/ReorderToolsCPU.h deleted file mode 100644 index a047b3166d..0000000000 --- a/include/mgard-x/Testing/ReorderToolsCPU.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef REORDERTOOLSCPU_H -#define REORDERTOOLSCPU_H - -/* - * Copyright 2022, Oak Ridge National Laboratory. 
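With the serial adapter now included unconditionally (the RuntimeX.h change above), dispatch code always has a fallback backend to compile against. A hypothetical shape of such a dispatch — DeviceAdapter's template parameters are elided in this diff rendering, so the <TaskType, Backend> form here is an assumption:

template <typename TaskType>
ExecutionReturn Run(device_type dev, TaskType &task) {
  switch (dev) {
#ifdef MGARDX_COMPILE_CUDA
  case device_type::CUDA:
    return DeviceAdapter<TaskType, CUDA>().Execute(task);
#endif
#ifdef MGARDX_COMPILE_SYCL
  case device_type::SYCL:
    return DeviceAdapter<TaskType, SYCL>().Execute(task);
#endif
  default: // SERIAL is always compiled in
    return DeviceAdapter<TaskType, SERIAL>().Execute(task);
  }
}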
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "TensorMeshHierarchy.hpp" -#include "shuffle.hpp" - -namespace mgard { - -template -void ReorderCPU(TensorMeshHierarchy &hierarchy, T *input, T *output); -template -void ReverseReorderCPU(TensorMeshHierarchy &hierarchy, T *input, - T *output); - -} // namespace mgard - -#endif \ No newline at end of file diff --git a/include/mgard-x/Testing/ReorderToolsCPU.hpp b/include/mgard-x/Testing/ReorderToolsCPU.hpp deleted file mode 100644 index 11777d0470..0000000000 --- a/include/mgard-x/Testing/ReorderToolsCPU.hpp +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef REORDERTOOLSGPU_HPP -#define REORDERTOOLSGPU_HPP - -#include "TensorMeshHierarchy.hpp" -#include "shuffle.hpp" - -namespace mgard { - -template -void ReorderCPU(TensorMeshHierarchy &hierarchy, T *input, T *output) { - shuffle(hierarchy, input, output); -} - -template -void ReverseReorderCPU(TensorMeshHierarchy &hierarchy, T *input, - T *output) { - unshuffle(hierarchy, input, output); -} -} // namespace mgard - -namespace mgard { -#define KERNELS(D, T) \ - template void ReorderCPU(TensorMeshHierarchy & hierarchy, \ - T * input, T * output); \ - template void ReverseReorderCPU(TensorMeshHierarchy & hierarchy, \ - T * input, T * output); - -KERNELS(1, double) -KERNELS(1, float) -KERNELS(2, double) -KERNELS(2, float) -KERNELS(3, double) -KERNELS(3, float) -KERNELS(4, double) -KERNELS(4, float) -KERNELS(5, double) -KERNELS(5, float) - -#undef KERNELS -} // namespace mgard - -#endif \ No newline at end of file diff --git a/include/mgard-x/Testing/ReorderToolsGPU.h b/include/mgard-x/Testing/ReorderToolsGPU.h deleted file mode 100644 index 6b4220c411..0000000000 --- a/include/mgard-x/Testing/ReorderToolsGPU.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef REORDERTOOLSGPU_H -#define REORDERTOOLSGPU_H - -#include "../Common.h" - -namespace mgard_x { - -template -void ReorderGPU(Handle &handle, SubArray dinput, - SubArray &doutput, int l_target, int queue_idx); -template -void ReverseReorderGPU(Handle &handle, SubArray dinput, - SubArray &doutput, int l_target, - int queue_idx); - -} // namespace mgard_x - -#endif \ No newline at end of file diff --git a/include/mgard-x/Testing/ReorderToolsGPU.hpp b/include/mgard-x/Testing/ReorderToolsGPU.hpp deleted file mode 100644 index a0d9fec569..0000000000 --- a/include/mgard-x/Testing/ReorderToolsGPU.hpp +++ /dev/null @@ -1,337 +0,0 @@ -#ifndef REORDERTOOLSGPU_HPP -#define REORDERTOOLSGPU_HPP - -#include "../DataRefactoring.h" -#include "../GridProcessingKernel.h" -#include "../GridProcessingKernel3D.h" -#include "../LevelwiseProcessingKernel.h" -#include "ReorderToolsGPU.h" - -#include "../CommonInternal.h" - -namespace mgard_x { - -template -void ReorderGPU(Handle &handle, SubArray dinput, - SubArray &doutput, int l_target, int queue_idx) { - - SubArray dcoarse, dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, - dcoeff_rf, dcoeff_rc, dcoeff_rcf; - - DIM curr_dims[3]; - // handle.l_target = 1; - for (int l = 0; l < l_target; ++l) { - int range_l = std::min(6, (int)std::log2(handle.dofs[0][l]) - 1); - int range_lp1 = std::min(6, (int)std::log2(handle.dofs[0][l + 1]) - 1); - int unprocessed_idx = 0; - printf("reorder 1-3D\n"); - curr_dims[0] = 0; - curr_dims[1] = 1; - curr_dims[2] = 2; - dinput.project(curr_dims[0], curr_dims[1], curr_dims[2]); - doutput.project(curr_dims[0], curr_dims[1], - curr_dims[2]); // reuse input1 as temp space - calc_coeff_pointers(handle, 
curr_dims, l, doutput, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - printf("done calc ptrs\n"); - if (D <= 3) { - gpk_reo( - handle, handle.shapes_h[l], handle.shapes_d[l], - handle.shapes_d[l + 1], dinput.ldvs_d, doutput.ldvs_d, - handle.unprocessed_n[unprocessed_idx], - handle.unprocessed_dims_d[unprocessed_idx], curr_dims[2], - curr_dims[1], curr_dims[0], handle.ratio[curr_dims[2]][l], - handle.ratio[curr_dims[1]][l], handle.ratio[curr_dims[0]][l], - dinput.dv, dinput.lddv1, dinput.lddv2, dcoarse.dv, dcoarse.lddv1, - dcoarse.lddv2, - // null, lddv1, lddv2, - dcoeff_f.dv, dcoeff_f.lddv1, dcoeff_f.lddv2, - // null, lddv1, lddv2, - dcoeff_c.dv, dcoeff_c.lddv1, dcoeff_c.lddv2, - // null, lddv1, lddv2, - dcoeff_r.dv, dcoeff_r.lddv1, dcoeff_r.lddv2, - // null, lddv1, lddv2, - dcoeff_cf.dv, dcoeff_cf.lddv1, dcoeff_cf.lddv2, - // null, lddv1, lddv2, - dcoeff_rf.dv, dcoeff_rf.lddv1, dcoeff_rf.lddv2, - // null, lddv1, lddv2, - dcoeff_rc.dv, dcoeff_rc.lddv1, dcoeff_rc.lddv2, - // null, lddv1, lddv2, - dcoeff_rcf.dv, dcoeff_rcf.lddv1, dcoeff_rcf.lddv2, - // null, lddv1, lddv2, - queue_idx, - handle.auto_tuning_cc[handle.arch][handle.precision][range_l]); - printf("done reo\n"); - } else { - gpk_reo( - handle, handle.shapes_h[l], handle.shapes_d[l], - handle.shapes_d[l + 1], dinput.ldvs_d, doutput.ldvs_d, - handle.unprocessed_n[unprocessed_idx], - handle.unprocessed_dims_d[unprocessed_idx], curr_dims[2], - curr_dims[1], curr_dims[0], handle.ratio[curr_dims[2]][l], - handle.ratio[curr_dims[1]][l], handle.ratio[curr_dims[0]][l], - dinput.dv, dinput.lddv1, dinput.lddv2, dcoarse.dv, dcoarse.lddv1, - dcoarse.lddv2, - // null, lddv1, lddv2, - dcoeff_f.dv, dcoeff_f.lddv1, dcoeff_f.lddv2, - // null, lddv1, lddv2, - dcoeff_c.dv, dcoeff_c.lddv1, dcoeff_c.lddv2, - // null, lddv1, lddv2, - dcoeff_r.dv, dcoeff_r.lddv1, dcoeff_r.lddv2, - // null, lddv1, lddv2, - dcoeff_cf.dv, dcoeff_cf.lddv1, dcoeff_cf.lddv2, - // null, lddv1, lddv2, - dcoeff_rf.dv, dcoeff_rf.lddv1, dcoeff_rf.lddv2, - // null, lddv1, lddv2, - dcoeff_rc.dv, dcoeff_rc.lddv1, dcoeff_rc.lddv2, - // null, lddv1, lddv2, - dcoeff_rcf.dv, dcoeff_rcf.lddv1, dcoeff_rcf.lddv2, - // null, lddv1, lddv2, - queue_idx, - handle.auto_tuning_cc[handle.arch][handle.precision][range_l]); - - for (DIM d = 3; d < D; d += 2) { - // copy back to input for reordering again - lwpk(handle, handle.shapes_h[l], handle.shapes_d[l], - doutput.dv, doutput.ldvs_d, dinput.dv, dinput.ldvs_d, - queue_idx); - printf("reorder-restore %u-%uD\n", d + 1, d + 2); - curr_dims[0] = 0; - curr_dims[1] = d; - curr_dims[2] = d + 1; - dinput.project(curr_dims[0], curr_dims[1], curr_dims[2]); - doutput.project(curr_dims[0], curr_dims[1], - curr_dims[2]); // reuse input1 as temp space - calc_coeff_pointers(handle, curr_dims, l, doutput, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - - if (D - d == 1) { - unprocessed_idx += 1; - gpk_reo( - handle, handle.shapes_h[l], handle.shapes_d[l], - handle.shapes_d[l + 1], dinput.ldvs_d, doutput.ldvs_d, - handle.unprocessed_n[unprocessed_idx], - handle.unprocessed_dims_d[unprocessed_idx], curr_dims[2], - curr_dims[1], curr_dims[0], handle.ratio[curr_dims[2]][l], - handle.ratio[curr_dims[1]][l], handle.ratio[curr_dims[0]][l], - dinput.dv, dinput.lddv1, dinput.lddv2, dcoarse.dv, dcoarse.lddv1, - dcoarse.lddv2, - // null, lddv1, lddv2, - dcoeff_f.dv, dcoeff_f.lddv1, dcoeff_f.lddv2, - // null, lddv1, lddv2, - dcoeff_c.dv, dcoeff_c.lddv1, dcoeff_c.lddv2, - // null, lddv1, 
lddv2, - dcoeff_r.dv, dcoeff_r.lddv1, dcoeff_r.lddv2, - // null, lddv1, lddv2, - dcoeff_cf.dv, dcoeff_cf.lddv1, dcoeff_cf.lddv2, - // null, lddv1, lddv2, - dcoeff_rf.dv, dcoeff_rf.lddv1, dcoeff_rf.lddv2, - // null, lddv1, lddv2, - dcoeff_rc.dv, dcoeff_rc.lddv1, dcoeff_rc.lddv2, - // null, lddv1, lddv2, - dcoeff_rcf.dv, dcoeff_rcf.lddv1, dcoeff_rcf.lddv2, - // null, lddv1, lddv2, - queue_idx, - handle.auto_tuning_cc[handle.arch][handle.precision][range_l]); - } else { - unprocessed_idx += 2; - gpk_reo( - handle, handle.shapes_h[l], handle.shapes_d[l], - handle.shapes_d[l + 1], dinput.ldvs_d, doutput.ldvs_d, - handle.unprocessed_n[unprocessed_idx], - handle.unprocessed_dims_d[unprocessed_idx], curr_dims[2], - curr_dims[1], curr_dims[0], handle.ratio[curr_dims[2]][l], - handle.ratio[curr_dims[1]][l], handle.ratio[curr_dims[0]][l], - dinput.dv, dinput.lddv1, dinput.lddv2, dcoarse.dv, dcoarse.lddv1, - dcoarse.lddv2, - // null, lddv1, lddv2, - dcoeff_f.dv, dcoeff_f.lddv1, dcoeff_f.lddv2, - // null, lddv1, lddv2, - dcoeff_c.dv, dcoeff_c.lddv1, dcoeff_c.lddv2, - // null, lddv1, lddv2, - dcoeff_r.dv, dcoeff_r.lddv1, dcoeff_r.lddv2, - // null, lddv1, lddv2, - dcoeff_cf.dv, dcoeff_cf.lddv1, dcoeff_cf.lddv2, - // null, lddv1, lddv2, - dcoeff_rf.dv, dcoeff_rf.lddv1, dcoeff_rf.lddv2, - // null, lddv1, lddv2, - dcoeff_rc.dv, dcoeff_rc.lddv1, dcoeff_rc.lddv2, - // null, lddv1, lddv2, - dcoeff_rcf.dv, dcoeff_rcf.lddv1, dcoeff_rcf.lddv2, - // null, lddv1, lddv2, - queue_idx, - handle.auto_tuning_cc[handle.arch][handle.precision][range_l]); - } - } - } - } -} - -template -void ReverseReorderGPU(Handle &handle, SubArray dinput, - SubArray &doutput, int l_target, - int queue_idx) { - - SubArray dcoarse, dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, - dcoeff_rf, dcoeff_rc, dcoeff_rcf; - - DIM curr_dims[3]; - for (int l = 0; l < l_target; ++l) { - int range_l = std::min(6, (int)std::log2(handle.dofs[0][l]) - 1); - int range_lp1 = std::min(6, (int)std::log2(handle.dofs[0][l + 1]) - 1); - int unprocessed_idx = 0; - printf("reorder-restore 1-3D\n"); - curr_dims[0] = 0; - curr_dims[1] = 1; - curr_dims[2] = 2; - dinput.project(curr_dims[0], curr_dims[1], curr_dims[2]); - doutput.project(curr_dims[0], curr_dims[1], - curr_dims[2]); // reuse input1 as temp space - calc_coeff_pointers(handle, curr_dims, l, dinput, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - if (D <= 3) { - gpk_rev( - handle, handle.shapes_h[l], handle.shapes_d[l], - handle.shapes_d[l + 1], doutput.ldvs_d, dinput.ldvs_d, - handle.unprocessed_n[unprocessed_idx], - handle.unprocessed_dims_d[unprocessed_idx], curr_dims[2], - curr_dims[1], curr_dims[0], handle.ratio[curr_dims[2]][l], - handle.ratio[curr_dims[1]][l], handle.ratio[curr_dims[0]][l], - doutput.dv, doutput.lddv1, doutput.lddv2, dcoarse.dv, dcoarse.lddv1, - dcoarse.lddv2, - // null, lddv1, lddv2, - dcoeff_f.dv, dcoeff_f.lddv1, dcoeff_f.lddv2, - // null, lddv1, lddv2, - dcoeff_c.dv, dcoeff_c.lddv1, dcoeff_c.lddv2, - // null, lddv1, lddv2, - dcoeff_r.dv, dcoeff_r.lddv1, dcoeff_r.lddv2, - // null, lddv1, lddv2, - dcoeff_cf.dv, dcoeff_cf.lddv1, dcoeff_cf.lddv2, - // null, lddv1, lddv2, - dcoeff_rf.dv, dcoeff_rf.lddv1, dcoeff_rf.lddv2, - // null, lddv1, lddv2, - dcoeff_rc.dv, dcoeff_rc.lddv1, dcoeff_rc.lddv2, - // null, lddv1, lddv2, - dcoeff_rcf.dv, dcoeff_rcf.lddv1, dcoeff_rcf.lddv2, - // null, lddv1, lddv2, - 0, 0, 0, handle.dofs[curr_dims[2]][l], handle.dofs[curr_dims[1]][l], - handle.dofs[curr_dims[0]][l], queue_idx, - 
handle.auto_tuning_cc[handle.arch][handle.precision][range_l]); - } else { - gpk_rev( - handle, handle.shapes_h[l], handle.shapes_d[l], - handle.shapes_d[l + 1], doutput.ldvs_d, dinput.ldvs_d, - handle.unprocessed_n[unprocessed_idx], - handle.unprocessed_dims_d[unprocessed_idx], curr_dims[2], - curr_dims[1], curr_dims[0], handle.ratio[curr_dims[2]][l], - handle.ratio[curr_dims[1]][l], handle.ratio[curr_dims[0]][l], - doutput.dv, doutput.lddv1, doutput.lddv2, dcoarse.dv, dcoarse.lddv1, - dcoarse.lddv2, - // null, lddv1, lddv2, - dcoeff_f.dv, dcoeff_f.lddv1, dcoeff_f.lddv2, - // null, lddv1, lddv2, - dcoeff_c.dv, dcoeff_c.lddv1, dcoeff_c.lddv2, - // null, lddv1, lddv2, - dcoeff_r.dv, dcoeff_r.lddv1, dcoeff_r.lddv2, - // null, lddv1, lddv2, - dcoeff_cf.dv, dcoeff_cf.lddv1, dcoeff_cf.lddv2, - // null, lddv1, lddv2, - dcoeff_rf.dv, dcoeff_rf.lddv1, dcoeff_rf.lddv2, - // null, lddv1, lddv2, - dcoeff_rc.dv, dcoeff_rc.lddv1, dcoeff_rc.lddv2, - // null, lddv1, lddv2, - dcoeff_rcf.dv, dcoeff_rcf.lddv1, dcoeff_rcf.lddv2, - // null, lddv1, lddv2, - 0, 0, 0, handle.dofs[curr_dims[2]][l], handle.dofs[curr_dims[1]][l], - handle.dofs[curr_dims[0]][l], queue_idx, - handle.auto_tuning_cc[handle.arch][handle.precision][range_l]); - - for (DIM d = 3; d < D; d += 2) { - // copy back to input for reordering again - lwpk(handle, handle.shapes_h[l], handle.shapes_d[l], - doutput.dv, doutput.ldvs_d, dinput.dv, dinput.ldvs_d, - queue_idx); - - curr_dims[0] = 0; - curr_dims[1] = d; - curr_dims[2] = d + 1; - dinput.project(curr_dims[0], curr_dims[1], curr_dims[2]); - doutput.project(curr_dims[0], curr_dims[1], - curr_dims[2]); // reuse input1 as temp space - calc_coeff_pointers(handle, curr_dims, l, dinput, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - - if (D - d == 1) { - printf("reorder-restore %u-%uD\n", d + 1, d + 1); - unprocessed_idx += 1; - gpk_rev( - handle, handle.shapes_h[l], handle.shapes_d[l], - handle.shapes_d[l + 1], doutput.ldvs_d, dinput.ldvs_d, - handle.unprocessed_n[unprocessed_idx], - handle.unprocessed_dims_d[unprocessed_idx], curr_dims[2], - curr_dims[1], curr_dims[0], handle.ratio[curr_dims[2]][l], - handle.ratio[curr_dims[1]][l], handle.ratio[curr_dims[0]][l], - doutput.dv, doutput.lddv1, doutput.lddv2, dcoarse.dv, - dcoarse.lddv1, dcoarse.lddv2, - // null, lddv1, lddv2, - dcoeff_f.dv, dcoeff_f.lddv1, dcoeff_f.lddv2, - // null, lddv1, lddv2, - dcoeff_c.dv, dcoeff_c.lddv1, dcoeff_c.lddv2, - // null, lddv1, lddv2, - dcoeff_r.dv, dcoeff_r.lddv1, dcoeff_r.lddv2, - // null, lddv1, lddv2, - dcoeff_cf.dv, dcoeff_cf.lddv1, dcoeff_cf.lddv2, - // null, lddv1, lddv2, - dcoeff_rf.dv, dcoeff_rf.lddv1, dcoeff_rf.lddv2, - // null, lddv1, lddv2, - dcoeff_rc.dv, dcoeff_rc.lddv1, dcoeff_rc.lddv2, - // null, lddv1, lddv2, - dcoeff_rcf.dv, dcoeff_rcf.lddv1, dcoeff_rcf.lddv2, - // null, lddv1, lddv2, - 0, 0, 0, handle.dofs[curr_dims[2]][l], - handle.dofs[curr_dims[1]][l], handle.dofs[curr_dims[0]][l], - queue_idx, - handle.auto_tuning_cc[handle.arch][handle.precision][range_l]); - } else { - printf("reorder-restore %u-%uD\n", d + 1, d + 2); - unprocessed_idx += 2; - gpk_rev( - handle, handle.shapes_h[l], handle.shapes_d[l], - handle.shapes_d[l + 1], doutput.ldvs_d, dinput.ldvs_d, - handle.unprocessed_n[unprocessed_idx], - handle.unprocessed_dims_d[unprocessed_idx], curr_dims[2], - curr_dims[1], curr_dims[0], handle.ratio[curr_dims[2]][l], - handle.ratio[curr_dims[1]][l], handle.ratio[curr_dims[0]][l], - doutput.dv, doutput.lddv1, doutput.lddv2, dcoarse.dv, - 
dcoarse.lddv1, dcoarse.lddv2, - // null, lddv1, lddv2, - dcoeff_f.dv, dcoeff_f.lddv1, dcoeff_f.lddv2, - // null, lddv1, lddv2, - dcoeff_c.dv, dcoeff_c.lddv1, dcoeff_c.lddv2, - // null, lddv1, lddv2, - dcoeff_r.dv, dcoeff_r.lddv1, dcoeff_r.lddv2, - // null, lddv1, lddv2, - dcoeff_cf.dv, dcoeff_cf.lddv1, dcoeff_cf.lddv2, - // null, lddv1, lddv2, - dcoeff_rf.dv, dcoeff_rf.lddv1, dcoeff_rf.lddv2, - // null, lddv1, lddv2, - dcoeff_rc.dv, dcoeff_rc.lddv1, dcoeff_rc.lddv2, - // null, lddv1, lddv2, - dcoeff_rcf.dv, dcoeff_rcf.lddv1, dcoeff_rcf.lddv2, - // null, lddv1, lddv2, - 0, 0, 0, handle.dofs[curr_dims[2]][l], - handle.dofs[curr_dims[1]][l], handle.dofs[curr_dims[0]][l], - queue_idx, - handle.auto_tuning_cc[handle.arch][handle.precision][range_l]); - } - } - } - } -} -} // namespace mgard_x - -#endif \ No newline at end of file diff --git a/include/mgard-x/Utilities/CMakeLists.txt b/include/mgard-x/Utilities/CMakeLists.txt index 547372c92f..7105fe953f 100644 --- a/include/mgard-x/Utilities/CMakeLists.txt +++ b/include/mgard-x/Utilities/CMakeLists.txt @@ -1,5 +1,5 @@ list(APPEND MGARD_X_HEADER - ${CMAKE_CURRENT_SOURCE_DIR}/CheckEndianess.h ${CMAKE_CURRENT_SOURCE_DIR}/ErrorCollector.h + ${CMAKE_CURRENT_SOURCE_DIR}/Types.h ) set(MGARD_X_HEADER ${MGARD_X_HEADER} PARENT_SCOPE) \ No newline at end of file diff --git a/include/mgard-x/Utilities/CheckEndianess.h b/include/mgard-x/Utilities/CheckEndianess.h deleted file mode 100644 index b610ee71de..0000000000 --- a/include/mgard-x/Utilities/CheckEndianess.h +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#ifndef MGARD_X_CHECK_ENDIANESS_H -#define MGARD_X_CHECK_ENDIANESS_H - -#include "../Types.h" - -namespace mgard_x { -enum endiness_type CheckEndianess(); -} -#endif \ No newline at end of file diff --git a/include/mgard-x/Utilities/ErrorCalculator.h b/include/mgard-x/Utilities/ErrorCalculator.h index b6f81748b5..9e0d3ef10a 100644 --- a/include/mgard-x/Utilities/ErrorCalculator.h +++ b/include/mgard-x/Utilities/ErrorCalculator.h @@ -11,7 +11,7 @@ // #include "../../TensorMeshHierarchy.hpp" // #include "../../TensorNorms.hpp" // #include "../../shuffle.hpp" -#include "../Types.h" +#include "Types.h" namespace mgard_x { diff --git a/include/mgard-x/Types.h b/include/mgard-x/Utilities/Types.h similarity index 91% rename from include/mgard-x/Types.h rename to include/mgard-x/Utilities/Types.h index cc06544472..d7990c71e9 100644 --- a/include/mgard-x/Types.h +++ b/include/mgard-x/Utilities/Types.h @@ -19,13 +19,13 @@ enum class decomposition_type : uint8_t { MultiDim, SingleDim }; enum class processor_type : uint8_t { CPU, GPU_CUDA, - X_Serial, + X_SERIAL, X_CUDA, X_HIP, X_SYCL }; -enum class device_type : uint8_t { Auto, Serial, CUDA, HIP, None }; +enum class device_type : uint8_t { AUTO, SERIAL, CUDA, HIP, SYCL, NONE }; enum class error_bound_type : uint8_t { REL, ABS }; enum class norm_type : uint8_t { L_Inf, L_2 }; @@ -55,7 +55,7 @@ enum class domain_decomposition_type : uint8_t { MaxDim, Linearize }; #include // #include "RuntimeX/DataStructures/Array.h" -#include "Hierarchy.h" +#include "../Hierarchy/Hierarchy.h" // #include "RuntimeX/Messages/Message.h" // #include "ErrorCalculator.h" // #include "MemoryManagement.h" diff --git a/src/mgard-x/CMakeLists.txt b/src/mgard-x/CMakeLists.txt index d712572eeb..b7bca50012 100644 --- a/src/mgard-x/CMakeLists.txt +++ 
b/src/mgard-x/CMakeLists.txt @@ -1,9 +1,9 @@ add_subdirectory (DataRefactoring) -add_subdirectory (HighLevelAPI) +add_subdirectory (CompressionLowLevel) +add_subdirectory (CompressionHighLevel) add_subdirectory (RuntimeX) -add_subdirectory (CompressionWorkflow) -add_subdirectory (Utilities) set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) +set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) set(MGARD_X_SRC ${MGARD_X_SRC} PARENT_SCOPE) diff --git a/src/mgard-x/CompressionHighLevel/CMakeLists.txt b/src/mgard-x/CompressionHighLevel/CMakeLists.txt new file mode 100644 index 0000000000..ef7188197a --- /dev/null +++ b/src/mgard-x/CompressionHighLevel/CMakeLists.txt @@ -0,0 +1,11 @@ +MgardXGenerateSourceAllDevices("Compress") +MgardXGenerateSourceAllDevices("Decompress") +set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) +set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) +set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) +set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) + +list(APPEND MGARD_X_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/DynamicAPI.cpp) + +set(MGARD_X_SRC ${MGARD_X_SRC} PARENT_SCOPE) diff --git a/src/mgard-x/CompressionHighLevel/Compress.cpp.in b/src/mgard-x/CompressionHighLevel/Compress.cpp.in new file mode 100644 index 0000000000..d1d67079bd --- /dev/null +++ b/src/mgard-x/CompressionHighLevel/Compress.cpp.in @@ -0,0 +1,38 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/CompressionHighLevel/CompressionHighLevel.hpp" +// clang-format off +namespace mgard_x { + +template void compress<@DEVICE_TYPE@>(DIM D, data_type dtype, + std::vector<SIZE> shape, double tol, + double s, enum error_bound_type mode, + const void *original_data, + void *&compressed_data, + size_t &compressed_size, Config config, + bool output_pre_allocated); + +template void +compress<@DEVICE_TYPE@>(DIM D, data_type dtype, std::vector<SIZE> shape, + double tol, double s, enum error_bound_type mode, + const void *original_data, void *&compressed_data, + size_t &compressed_size, bool output_pre_allocated); + +template void compress<@DEVICE_TYPE@>( + DIM D, data_type dtype, std::vector<SIZE> shape, double tol, double s, + enum error_bound_type mode, const void *original_data, + void *&compressed_data, size_t &compressed_size, + std::vector<const Byte *> coords, Config config, bool output_pre_allocated); + +template void compress<@DEVICE_TYPE@>( + DIM D, data_type dtype, std::vector<SIZE> shape, double tol, double s, + enum error_bound_type mode, const void *original_data, + void *&compressed_data, size_t &compressed_size, + std::vector<const Byte *> coords, bool output_pre_allocated); +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/CompressionHighLevel/Decompress.cpp.in b/src/mgard-x/CompressionHighLevel/Decompress.cpp.in new file mode 100644 index 0000000000..91e0fafee4 --- /dev/null +++ b/src/mgard-x/CompressionHighLevel/Decompress.cpp.in @@ -0,0 +1,35 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory.
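(The high-level Compress.cpp.in above, and the Decompress.cpp.in that follows, are device-generic templates; a sketch of the translation unit they would presumably yield for one backend, assuming MgardXGenerateSourceAllDevices runs configure_file once per enabled device with @DEVICE_TYPE@ bound to that backend's tag type. The generated file name below is hypothetical.)

    // Hypothetical generated unit, e.g. Compress_SYCL.cpp: @DEVICE_TYPE@ -> SYCL.
    // One explicit instantiation per backend keeps the template-heavy
    // implementation out of DynamicAPI.cpp, which only dispatches at runtime.
    #include "mgard-x/CompressionHighLevel/CompressionHighLevel.hpp"
    namespace mgard_x {
    template void compress<SYCL>(DIM D, data_type dtype, std::vector<SIZE> shape,
                                 double tol, double s, enum error_bound_type mode,
                                 const void *original_data, void *&compressed_data,
                                 size_t &compressed_size, Config config,
                                 bool output_pre_allocated);
    } // namespace mgard_x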
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/CompressionHighLevel/CompressionHighLevel.hpp" +// clang-format off +namespace mgard_x { + +template void decompress<@DEVICE_TYPE@>(const void *compressed_data, + size_t compressed_size, + void *&decompressed_data, + Config config, + bool output_pre_allocated); + +template void decompress<@DEVICE_TYPE@>(const void *compressed_data, + size_t compressed_size, + void *&decompressed_data, + bool output_pre_allocated); + +template void +decompress<@DEVICE_TYPE@>(const void *compressed_data, size_t compressed_size, + void *&decompressed_data, data_type &dtype, + std::vector &shape, Config config, + bool output_pre_allocated); + +template void +decompress<@DEVICE_TYPE@>(const void *compressed_data, size_t compressed_size, + void *&decompressed_data, data_type &dtype, + std::vector &shape, bool output_pre_allocated); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/HighLevelAPI/DynamicAPI.cpp b/src/mgard-x/CompressionHighLevel/DynamicAPI.cpp similarity index 67% rename from src/mgard-x/HighLevelAPI/DynamicAPI.cpp rename to src/mgard-x/CompressionHighLevel/DynamicAPI.cpp index 1127b1a6cc..c3b576826a 100644 --- a/src/mgard-x/HighLevelAPI/DynamicAPI.cpp +++ b/src/mgard-x/CompressionHighLevel/DynamicAPI.cpp @@ -11,19 +11,18 @@ #include #include -#include "MGARDXConfig.h" #include "compress_x.hpp" -#include "mgard-x/Hierarchy.h" -#include "mgard-x/HighLevelAPI.h" -#include "mgard-x/Metadata.hpp" +#include "mgard-x/CompressionHighLevel/CompressionHighLevel.h" +#include "mgard-x/CompressionHighLevel/Metadata.hpp" +#include "mgard-x/Hierarchy/Hierarchy.h" #include "mgard-x/RuntimeX/RuntimeXPublic.h" namespace mgard_x { enum device_type auto_detect_device() { - enum device_type dev_type = device_type::None; + enum device_type dev_type = device_type::NONE; #if MGARD_ENABLE_SERIAL - dev_type = device_type::Serial; + dev_type = device_type::SERIAL; #endif #if MGARD_ENABLE_CUDA if (deviceAvailable()) { @@ -35,7 +34,12 @@ enum device_type auto_detect_device() { dev_type = device_type::HIP; } #endif - if (dev_type == device_type::None) { +#if MGARD_ENABLE_SYCL + if (deviceAvailable()) { + dev_type = device_type::SYCL; + } +#endif + if (dev_type == device_type::NONE) { std::cout << log::log_err << "MGARD-X was not built with any backend.\n"; exit(-1); } @@ -48,17 +52,17 @@ void compress(DIM D, data_type dtype, std::vector shape, double tol, bool output_pre_allocated) { enum device_type dev_type = config.dev_type; - if (dev_type == device_type::Auto) { + if (dev_type == device_type::AUTO) { dev_type = auto_detect_device(); } - if (dev_type == device_type::Serial) { + if (dev_type == device_type::SERIAL) { #if MGARD_ENABLE_SERIAL - compress(D, dtype, shape, tol, s, mode, original_data, + compress(D, dtype, shape, tol, s, mode, original_data, compressed_data, compressed_size, config, output_pre_allocated); #else - std::cout << log::log_err << "MGARD-X was not built with Serial backend.\n"; + std::cout << log::log_err << "MGARD-X was not built with SERIAL backend.\n"; exit(-1); #endif } else if (dev_type == device_type::CUDA) { @@ -77,6 +81,15 @@ void compress(DIM D, data_type dtype, std::vector shape, double tol, #else std::cout << log::log_err << "MGARD-X was not built with HIP backend.\n"; exit(-1); +#endif + } else if (dev_type == device_type::SYCL) { +#if MGARD_ENABLE_SYCL + compress(D, 
dtype, shape, tol, s, mode, original_data, + compressed_data, compressed_size, config, + output_pre_allocated); +#else + std::cout << log::log_err << "MGARD-X was not built with SYCL backend.\n"; + exit(-1); #endif } else { std::cout << log::log_err << "Unsupported backend.\n"; @@ -90,12 +103,12 @@ void compress(DIM D, data_type dtype, std::vector shape, double tol, enum device_type dev_type = auto_detect_device(); - if (dev_type == device_type::Serial) { + if (dev_type == device_type::SERIAL) { #if MGARD_ENABLE_SERIAL - compress(D, dtype, shape, tol, s, mode, original_data, + compress(D, dtype, shape, tol, s, mode, original_data, compressed_data, compressed_size, output_pre_allocated); #else - std::cout << log::log_err << "MGARD-X was not built with Serial backend.\n"; + std::cout << log::log_err << "MGARD-X was not built with SERIAL backend.\n"; exit(-1); #endif } else if (dev_type == device_type::CUDA) { @@ -113,6 +126,14 @@ void compress(DIM D, data_type dtype, std::vector shape, double tol, #else std::cout << log::log_err << "MGARD-X was not built with HIP backend.\n"; exit(-1); +#endif + } else if (dev_type == device_type::SYCL) { +#if MGARD_ENABLE_SYCL + compress(D, dtype, shape, tol, s, mode, original_data, + compressed_data, compressed_size, output_pre_allocated); +#else + std::cout << log::log_err << "MGARD-X was not built with SYCL backend.\n"; + exit(-1); #endif } else { std::cout << log::log_err << "Unsupported backend.\n"; @@ -126,17 +147,17 @@ void compress(DIM D, data_type dtype, std::vector shape, double tol, bool output_pre_allocated) { enum device_type dev_type = config.dev_type; - if (dev_type == device_type::Auto) { + if (dev_type == device_type::AUTO) { dev_type = auto_detect_device(); } - if (dev_type == device_type::Serial) { + if (dev_type == device_type::SERIAL) { #if MGARD_ENABLE_SERIAL - compress(D, dtype, shape, tol, s, mode, original_data, + compress(D, dtype, shape, tol, s, mode, original_data, compressed_data, compressed_size, coords, config, output_pre_allocated); #else - std::cout << log::log_err << "MGARD-X was not built with Serial backend.\n"; + std::cout << log::log_err << "MGARD-X was not built with SERIAL backend.\n"; exit(-1); #endif } else if (dev_type == device_type::CUDA) { @@ -155,6 +176,15 @@ void compress(DIM D, data_type dtype, std::vector shape, double tol, #else std::cout << log::log_err << "MGARD-X was not built with HIP backend.\n"; exit(-1); +#endif + } else if (dev_type == device_type::SYCL) { +#if MGARD_ENABLE_SYCL + compress(D, dtype, shape, tol, s, mode, original_data, + compressed_data, compressed_size, coords, config, + output_pre_allocated); +#else + std::cout << log::log_err << "MGARD-X was not built with SYCL backend.\n"; + exit(-1); #endif } else { std::cout << log::log_err << "Unsupported backend.\n"; @@ -168,13 +198,13 @@ void compress(DIM D, data_type dtype, std::vector shape, double tol, enum device_type dev_type = auto_detect_device(); - if (dev_type == device_type::Serial) { + if (dev_type == device_type::SERIAL) { #if MGARD_ENABLE_SERIAL - compress(D, dtype, shape, tol, s, mode, original_data, + compress(D, dtype, shape, tol, s, mode, original_data, compressed_data, compressed_size, coords, output_pre_allocated); #else - std::cout << log::log_err << "MGARD-X was not built with Serial backend.\n"; + std::cout << log::log_err << "MGARD-X was not built with SERIAL backend.\n"; exit(-1); #endif } else if (dev_type == device_type::CUDA) { @@ -193,6 +223,15 @@ void compress(DIM D, data_type dtype, std::vector shape, double 
tol, #else std::cout << log::log_err << "MGARD-X was not built with HIP backend.\n"; exit(-1); +#endif + } else if (dev_type == device_type::SYCL) { +#if MGARD_ENABLE_SYCL + compress(D, dtype, shape, tol, s, mode, original_data, + compressed_data, compressed_size, coords, + output_pre_allocated); +#else + std::cout << log::log_err << "MGARD-X was not built with SYCL backend.\n"; + exit(-1); #endif } else { std::cout << log::log_err << "Unsupported backend.\n"; @@ -204,16 +243,16 @@ void decompress(const void *compressed_data, size_t compressed_size, bool output_pre_allocated) { enum device_type dev_type = config.dev_type; - if (dev_type == device_type::Auto) { + if (dev_type == device_type::AUTO) { dev_type = auto_detect_device(); } - if (dev_type == device_type::Serial) { + if (dev_type == device_type::SERIAL) { #if MGARD_ENABLE_SERIAL - decompress(compressed_data, compressed_size, decompressed_data, + decompress(compressed_data, compressed_size, decompressed_data, config, output_pre_allocated); #else - std::cout << log::log_err << "MGARD-X was not built with Serial backend.\n"; + std::cout << log::log_err << "MGARD-X was not built with SERIAL backend.\n"; exit(-1); #endif } else if (dev_type == device_type::CUDA) { @@ -231,6 +270,14 @@ void decompress(const void *compressed_data, size_t compressed_size, #else std::cout << log::log_err << "MGARD-X was not built with HIP backend.\n"; exit(-1); +#endif + } else if (dev_type == device_type::SYCL) { +#if MGARD_ENABLE_SYCL + decompress(compressed_data, compressed_size, decompressed_data, + config, output_pre_allocated); +#else + std::cout << log::log_err << "MGARD-X was not built with SYCL backend.\n"; + exit(-1); #endif } else { std::cout << log::log_err << "Unsupported backend.\n"; @@ -242,12 +289,12 @@ void decompress(const void *compressed_data, size_t compressed_size, enum device_type dev_type = auto_detect_device(); - if (dev_type == device_type::Serial) { + if (dev_type == device_type::SERIAL) { #if MGARD_ENABLE_SERIAL - decompress(compressed_data, compressed_size, decompressed_data, + decompress(compressed_data, compressed_size, decompressed_data, output_pre_allocated); #else - std::cout << log::log_err << "MGARD-X was not built with Serial backend.\n"; + std::cout << log::log_err << "MGARD-X was not built with SERIAL backend.\n"; exit(-1); #endif } else if (dev_type == device_type::CUDA) { @@ -265,6 +312,14 @@ void decompress(const void *compressed_data, size_t compressed_size, #else std::cout << log::log_err << "MGARD-X was not built with HIP backend.\n"; exit(-1); +#endif + } else if (dev_type == device_type::SYCL) { +#if MGARD_ENABLE_SYCL + decompress(compressed_data, compressed_size, decompressed_data, + output_pre_allocated); +#else + std::cout << log::log_err << "MGARD-X was not built with SYCL backend.\n"; + exit(-1); #endif } else { std::cout << log::log_err << "Unsupported backend.\n"; @@ -276,16 +331,16 @@ void decompress(const void *compressed_data, size_t compressed_size, data_type &dtype, Config config, bool output_pre_allocated) { enum device_type dev_type = config.dev_type; - if (dev_type == device_type::Auto) { + if (dev_type == device_type::AUTO) { dev_type = auto_detect_device(); } - if (dev_type == device_type::Serial) { + if (dev_type == device_type::SERIAL) { #if MGARD_ENABLE_SERIAL - decompress(compressed_data, compressed_size, decompressed_data, + decompress(compressed_data, compressed_size, decompressed_data, dtype, shape, config, output_pre_allocated); #else - std::cout << log::log_err << "MGARD-X was not 
built with Serial backend.\n"; + std::cout << log::log_err << "MGARD-X was not built with SERIAL backend.\n"; exit(-1); #endif } else if (dev_type == device_type::CUDA) { @@ -303,6 +358,14 @@ void decompress(const void *compressed_data, size_t compressed_size, #else std::cout << log::log_err << "MGARD-X was not built with HIP backend.\n"; exit(-1); +#endif + } else if (dev_type == device_type::SYCL) { +#if MGARD_ENABLE_SYCL + decompress(compressed_data, compressed_size, decompressed_data, dtype, + shape, config, output_pre_allocated); +#else + std::cout << log::log_err << "MGARD-X was not built with SYCL backend.\n"; + exit(-1); #endif } else { std::cout << log::log_err << "Unsupported backend.\n"; @@ -315,12 +378,12 @@ void decompress(const void *compressed_data, size_t compressed_size, enum device_type dev_type = auto_detect_device(); - if (dev_type == device_type::Serial) { + if (dev_type == device_type::SERIAL) { #if MGARD_ENABLE_SERIAL - decompress(compressed_data, compressed_size, decompressed_data, + decompress(compressed_data, compressed_size, decompressed_data, dtype, shape, output_pre_allocated); #else - std::cout << log::log_err << "MGARD-X was not built with Serial backend.\n"; + std::cout << log::log_err << "MGARD-X was not built with SERIAL backend.\n"; exit(-1); #endif } else if (dev_type == device_type::CUDA) { @@ -338,6 +401,14 @@ void decompress(const void *compressed_data, size_t compressed_size, #else std::cout << log::log_err << "MGARD-X was not built with HIP backend.\n"; exit(-1); +#endif + } else if (dev_type == device_type::SYCL) { +#if MGARD_ENABLE_SYCL + decompress(compressed_data, compressed_size, decompressed_data, dtype, + shape, output_pre_allocated); +#else + std::cout << log::log_err << "MGARD-X was not built with SYCL backend.\n"; + exit(-1); #endif } else { std::cout << log::log_err << "Unsupported backend.\n"; @@ -346,15 +417,15 @@ void decompress(const void *compressed_data, size_t compressed_size, void BeginAutoTuning(enum device_type dev_type) { - if (dev_type == device_type::Auto) { + if (dev_type == device_type::AUTO) { dev_type = auto_detect_device(); } - if (dev_type == device_type::Serial) { + if (dev_type == device_type::SERIAL) { #if MGARD_ENABLE_SERIAL - mgard_x::BeginAutoTuning(); + mgard_x::BeginAutoTuning(); #else - std::cout << log::log_err << "MGARD-X was not built with Serial backend.\n"; + std::cout << log::log_err << "MGARD-X was not built with SERIAL backend.\n"; exit(-1); #endif } else if (dev_type == device_type::CUDA) { @@ -370,6 +441,13 @@ void BeginAutoTuning(enum device_type dev_type) { #else std::cout << log::log_err << "MGARD-X was not built with HIP backend.\n"; exit(-1); +#endif + } else if (dev_type == device_type::SYCL) { +#if MGARD_ENABLE_SYCL + mgard_x::BeginAutoTuning(); +#else + std::cout << log::log_err << "MGARD-X was not built with SYCL backend.\n"; + exit(-1); #endif } else { std::cout << log::log_err << "Unsupported backend.\n"; @@ -378,15 +456,15 @@ void BeginAutoTuning(enum device_type dev_type) { void EndAutoTuning(enum device_type dev_type) { - if (dev_type == device_type::Auto) { + if (dev_type == device_type::AUTO) { dev_type = auto_detect_device(); } - if (dev_type == device_type::Serial) { + if (dev_type == device_type::SERIAL) { #if MGARD_ENABLE_SERIAL - mgard_x::EndAutoTuning(); + mgard_x::EndAutoTuning(); #else - std::cout << log::log_err << "MGARD-X was not built with Serial backend.\n"; + std::cout << log::log_err << "MGARD-X was not built with SERIAL backend.\n"; exit(-1); #endif } else if (dev_type 
== device_type::CUDA) { @@ -402,6 +480,13 @@ void EndAutoTuning(enum device_type dev_type) { #else std::cout << log::log_err << "MGARD-X was not built with HIP backend.\n"; exit(-1); +#endif + } else if (dev_type == device_type::SYCL) { +#if MGARD_ENABLE_SYCL + mgard_x::EndAutoTuning<SYCL>(); +#else + std::cout << log::log_err << "MGARD-X was not built with SYCL backend.\n"; + exit(-1); #endif } else { std::cout << log::log_err << "Unsupported backend.\n"; diff --git a/src/mgard-x/CompressionLowLevel/CMakeLists.txt b/src/mgard-x/CompressionLowLevel/CMakeLists.txt new file mode 100644 index 0000000000..c10580c777 --- /dev/null +++ b/src/mgard-x/CompressionLowLevel/CMakeLists.txt @@ -0,0 +1,6 @@ +MgardXGenerateSourceAllCombinations("Compress") +MgardXGenerateSourceAllCombinations("Decompress") +set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) +set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) +set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) +set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/CompressionLowLevel/Compress.cpp.in b/src/mgard-x/CompressionLowLevel/Compress.cpp.in new file mode 100644 index 0000000000..f042976823 --- /dev/null +++ b/src/mgard-x/CompressionLowLevel/Compress.cpp.in @@ -0,0 +1,20 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/CompressionLowLevel/CompressionLowLevel.hpp" +// clang-format off +namespace mgard_x { + +template Array<1, unsigned char, @DEVICE_TYPE@> +compress<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + Hierarchy<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &hierarchy, + Array<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &in_array, + enum error_bound_type type, @DATA_TYPE@ tol, @DATA_TYPE@ s, + @DATA_TYPE@ &norm, Config config); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/CompressionLowLevel/Decompress.cpp.in b/src/mgard-x/CompressionLowLevel/Decompress.cpp.in new file mode 100644 index 0000000000..49a7bb4f0d --- /dev/null +++ b/src/mgard-x/CompressionLowLevel/Decompress.cpp.in @@ -0,0 +1,20 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory.
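(For orientation, a minimal caller sketch against the low-level interface instantiated above. It follows the compress/decompress signatures in these .cpp.in files; the Hierarchy/Array construction and the load() call are assumed from MGARD-X's public examples, and the shape, tolerance, and smoothness values are illustrative only.)

    #include "mgard-x/CompressionLowLevel/CompressionLowLevel.hpp"
    #include <vector>
    void roundtrip(float *host_data) {
      std::vector<mgard_x::SIZE> shape{128, 128, 128};
      mgard_x::Hierarchy<3, float, mgard_x::SYCL> hierarchy(shape);
      mgard_x::Array<3, float, mgard_x::SYCL> in_array(shape);
      in_array.load(host_data); // copy input into the device-resident Array
      mgard_x::Config config;
      float norm = 0;
      // compress returns a 1-D byte Array and records the norm for later use.
      mgard_x::Array<1, unsigned char, mgard_x::SYCL> compressed =
          mgard_x::compress(hierarchy, in_array, mgard_x::error_bound_type::REL,
                            1e-3f, 0.0f, norm, config);
      // decompress takes the same hierarchy plus the recorded norm (by value).
      mgard_x::Array<3, float, mgard_x::SYCL> decompressed =
          mgard_x::decompress(hierarchy, compressed,
                              mgard_x::error_bound_type::REL, 1e-3f, 0.0f,
                              norm, config);
    }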
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/CompressionLowLevel/CompressionLowLevel.hpp" +// clang-format off +namespace mgard_x { + +template Array<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> +decompress<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + Hierarchy<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &hierarchy, + Array<1, unsigned char, @DEVICE_TYPE@> &compressed_array, + enum error_bound_type type, @DATA_TYPE@ tol, @DATA_TYPE@ s, + @DATA_TYPE@ norm, Config config); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/CompressionWorkflow/CMakeLists.txt b/src/mgard-x/CompressionWorkflow/CMakeLists.txt deleted file mode 100644 index 82df5a06d0..0000000000 --- a/src/mgard-x/CompressionWorkflow/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -if(MGARD_ENABLE_SERIAL) - add_subdirectory (Serial) - set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) -endif() -if(MGARD_ENABLE_CUDA) - add_subdirectory (CUDA) - set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) -endif() -if(MGARD_ENABLE_HIP) - add_subdirectory (HIP) - set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) -endif() \ No newline at end of file diff --git a/src/mgard-x/CompressionWorkflow/CUDA/CMakeLists.txt b/src/mgard-x/CompressionWorkflow/CUDA/CMakeLists.txt deleted file mode 100644 index 3ea070d9a5..0000000000 --- a/src/mgard-x/CompressionWorkflow/CUDA/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -list(APPEND MGARD_X_CUDA_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/CompressionWorkflow.cu) - -set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/CompressionWorkflow/CUDA/CompressionWorkflow.cu b/src/mgard-x/CompressionWorkflow/CUDA/CompressionWorkflow.cu deleted file mode 100644 index 16163a1813..0000000000 --- a/src/mgard-x/CompressionWorkflow/CUDA/CompressionWorkflow.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
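(The per-backend CompressionWorkflow sources deleted in the next few hunks carried these same instantiations by hand, stamped out by the KERNELS(D, T) macro visible below for D = 1..5 and T = double/float. Spelled out for one combination, with the template arguments written explicitly on the assumption that they match the generated sources above:)

    // Roughly what KERNELS(3, float) expanded to in the deleted CUDA unit.
    template Array<1, unsigned char, CUDA> compress<3, float, CUDA>(
        Hierarchy<3, float, CUDA> &hierarchy, Array<3, float, CUDA> &in_array,
        enum error_bound_type type, float tol, float s, float &norm,
        Config config);
    template Array<3, float, CUDA> decompress<3, float, CUDA>(
        Hierarchy<3, float, CUDA> &hierarchy,
        Array<1, unsigned char, CUDA> &compressed_array,
        enum error_bound_type type, float tol, float s, float norm,
        Config config);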
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/CompressionWorkflow.hpp" - -#include - -#include -namespace mgard_x { - -#define KERNELS(D, T) \ - template Array<1, unsigned char, CUDA> compress( \ - Hierarchy & hierarchy, Array & in_array, \ - enum error_bound_type type, T tol, T s, T & norm, Config config); \ - template Array decompress( \ - Hierarchy & hierarchy, \ - Array<1, unsigned char, CUDA> & compressed_array, \ - enum error_bound_type type, T tol, T s, T norm, Config config); - -KERNELS(1, double) -KERNELS(1, float) -KERNELS(2, double) -KERNELS(2, float) -KERNELS(3, double) -KERNELS(3, float) -KERNELS(4, double) -KERNELS(4, float) -KERNELS(5, double) -KERNELS(5, float) -#undef KERNELS - -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/CompressionWorkflow/HIP/CMakeLists.txt b/src/mgard-x/CompressionWorkflow/HIP/CMakeLists.txt deleted file mode 100644 index 4360eb9402..0000000000 --- a/src/mgard-x/CompressionWorkflow/HIP/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -list(APPEND MGARD_X_HIP_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/CompressionWorkflow.cpp) - -set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/CompressionWorkflow/HIP/CompressionWorkflow.cpp b/src/mgard-x/CompressionWorkflow/HIP/CompressionWorkflow.cpp deleted file mode 100644 index f2df577737..0000000000 --- a/src/mgard-x/CompressionWorkflow/HIP/CompressionWorkflow.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/CompressionWorkflow.hpp" - -#include - -#include -namespace mgard_x { - -#define KERNELS(D, T) \ - template Array<1, unsigned char, HIP> compress( \ - Hierarchy & hierarchy, Array & in_array, \ - enum error_bound_type type, T tol, T s, T & norm, Config config); \ - template Array decompress( \ - Hierarchy & hierarchy, \ - Array<1, unsigned char, HIP> & compressed_array, \ - enum error_bound_type type, T tol, T s, T norm, Config config); - -KERNELS(1, double) -KERNELS(1, float) -KERNELS(2, double) -KERNELS(2, float) -KERNELS(3, double) -KERNELS(3, float) -KERNELS(4, double) -KERNELS(4, float) -KERNELS(5, double) -KERNELS(5, float) -#undef KERNELS - -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/CompressionWorkflow/Serial/CMakeLists.txt b/src/mgard-x/CompressionWorkflow/Serial/CMakeLists.txt deleted file mode 100644 index a8196d79a3..0000000000 --- a/src/mgard-x/CompressionWorkflow/Serial/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -list(APPEND MGARD_X_SERIAL_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/CompressionWorkflow.cpp) - -set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/CompressionWorkflow/Serial/CompressionWorkflow.cpp b/src/mgard-x/CompressionWorkflow/Serial/CompressionWorkflow.cpp deleted file mode 100644 index a745792615..0000000000 --- a/src/mgard-x/CompressionWorkflow/Serial/CompressionWorkflow.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/CompressionWorkflow.hpp" - -#include - -#include -namespace mgard_x { - -#define KERNELS(D, T) \ - template Array<1, unsigned char, Serial> compress( \ - Hierarchy & hierarchy, Array & in_array, \ - enum error_bound_type type, T tol, T s, T & norm, Config config); \ - template Array decompress( \ - Hierarchy & hierarchy, \ - Array<1, unsigned char, Serial> & compressed_array, \ - enum error_bound_type type, T tol, T s, T norm, Config config); - -KERNELS(1, double) -KERNELS(1, float) -KERNELS(2, double) -KERNELS(2, float) -KERNELS(3, double) -KERNELS(3, float) -KERNELS(4, double) -KERNELS(4, float) -KERNELS(5, double) -KERNELS(5, float) -#undef KERNELS - -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CMakeLists.txt b/src/mgard-x/DataRefactoring/CMakeLists.txt index 82df5a06d0..21b0652e7f 100644 --- a/src/mgard-x/DataRefactoring/CMakeLists.txt +++ b/src/mgard-x/DataRefactoring/CMakeLists.txt @@ -1,12 +1,6 @@ -if(MGARD_ENABLE_SERIAL) - add_subdirectory (Serial) - set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) -endif() -if(MGARD_ENABLE_CUDA) - add_subdirectory (CUDA) - set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) -endif() -if(MGARD_ENABLE_HIP) - add_subdirectory (HIP) - set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) -endif() \ No newline at end of file +add_subdirectory (MultiDimension) +add_subdirectory (SingleDimension) +set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) +set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) +set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) +set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/CMakeLists.txt b/src/mgard-x/DataRefactoring/CUDA/CMakeLists.txt deleted file mode 100644 index 62c32de19d..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/CMakeLists.txt +++ /dev/null @@ -1,24 +0,0 @@ -list(APPEND MGARD_X_CUDA_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_1D_Float.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_1D_Double.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_1D_Float.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_1D_Double.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_2D_Float.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_2D_Double.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_2D_Float.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_2D_Double.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_3D_Float.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_3D_Double.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_3D_Float.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_3D_Double.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_4D_Float.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_4D_Double.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_4D_Float.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_4D_Double.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_5D_Float.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_5D_Double.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_5D_Float.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_5D_Double.cu - ) - -set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Decomposition_1D_Double.cu b/src/mgard-x/DataRefactoring/CUDA/Decomposition_1D_Double.cu deleted file mode 100644 index e7befd552d..0000000000 --- 
a/src/mgard-x/DataRefactoring/CUDA/Decomposition_1D_Double.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<1, double, CUDA>(Hierarchy<1, double, CUDA> &hierarchy, - SubArray<1, double, CUDA> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<1, double, CUDA>(Hierarchy<1, double, CUDA> &hierarchy, - SubArray<1, double, CUDA> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Decomposition_1D_Float.cu b/src/mgard-x/DataRefactoring/CUDA/Decomposition_1D_Float.cu deleted file mode 100644 index b8a014ae48..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Decomposition_1D_Float.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<1, float, CUDA>(Hierarchy<1, float, CUDA> &hierarchy, - SubArray<1, float, CUDA> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<1, float, CUDA>(Hierarchy<1, float, CUDA> &hierarchy, - SubArray<1, float, CUDA> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x diff --git a/src/mgard-x/DataRefactoring/CUDA/Decomposition_2D_Double.cu b/src/mgard-x/DataRefactoring/CUDA/Decomposition_2D_Double.cu deleted file mode 100644 index 27fe8c14f3..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Decomposition_2D_Double.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<2, double, CUDA>(Hierarchy<2, double, CUDA> &hierarchy, - SubArray<2, double, CUDA> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<2, double, CUDA>(Hierarchy<2, double, CUDA> &hierarchy, - SubArray<2, double, CUDA> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x diff --git a/src/mgard-x/DataRefactoring/CUDA/Decomposition_2D_Float.cu b/src/mgard-x/DataRefactoring/CUDA/Decomposition_2D_Float.cu deleted file mode 100644 index dca55294d8..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Decomposition_2D_Float.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
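(Every file in this block instantiates the same two function templates for one dimension/precision pair. The primary declarations they presume look like the following sketch, inferred directly from the instantiations; the authoritative declarations live in the two DataRefactoring.hpp headers included by each file.)

    namespace mgard_x {
    // Multigrid decomposition of v across l_target levels, enqueued on queue_idx.
    template <DIM D, typename T, typename DeviceType>
    void decompose(Hierarchy<D, T, DeviceType> &hierarchy,
                   SubArray<D, T, DeviceType> &v, SIZE l_target, int queue_idx);
    // Single-dimension variant used by the SingleDimension refactoring path.
    template <DIM D, typename T, typename DeviceType>
    void decompose_single(Hierarchy<D, T, DeviceType> &hierarchy,
                          SubArray<D, T, DeviceType> &v, SIZE l_target,
                          int queue_idx);
    } // namespace mgard_x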
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<2, float, CUDA>(Hierarchy<2, float, CUDA> &hierarchy, - SubArray<2, float, CUDA> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<2, float, CUDA>(Hierarchy<2, float, CUDA> &hierarchy, - SubArray<2, float, CUDA> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Decomposition_3D_Double.cu b/src/mgard-x/DataRefactoring/CUDA/Decomposition_3D_Double.cu deleted file mode 100644 index 8696ff6cd0..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Decomposition_3D_Double.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<3, double, CUDA>(Hierarchy<3, double, CUDA> &hierarchy, - SubArray<3, double, CUDA> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<3, double, CUDA>(Hierarchy<3, double, CUDA> &hierarchy, - SubArray<3, double, CUDA> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x diff --git a/src/mgard-x/DataRefactoring/CUDA/Decomposition_3D_Float.cu b/src/mgard-x/DataRefactoring/CUDA/Decomposition_3D_Float.cu deleted file mode 100644 index d5b9678afe..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Decomposition_3D_Float.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<3, float, CUDA>(Hierarchy<3, float, CUDA> &hierarchy, - SubArray<3, float, CUDA> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<3, float, CUDA>(Hierarchy<3, float, CUDA> &hierarchy, - SubArray<3, float, CUDA> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Decomposition_4D_Double.cu b/src/mgard-x/DataRefactoring/CUDA/Decomposition_4D_Double.cu deleted file mode 100644 index b60c26ee29..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Decomposition_4D_Double.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<4, double, CUDA>(Hierarchy<4, double, CUDA> &hierarchy, - SubArray<4, double, CUDA> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<4, double, CUDA>(Hierarchy<4, double, CUDA> &hierarchy, - SubArray<4, double, CUDA> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Decomposition_4D_Float.cu b/src/mgard-x/DataRefactoring/CUDA/Decomposition_4D_Float.cu deleted file mode 100644 index b445c109bf..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Decomposition_4D_Float.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<4, float, CUDA>(Hierarchy<4, float, CUDA> &hierarchy, - SubArray<4, float, CUDA> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<4, float, CUDA>(Hierarchy<4, float, CUDA> &hierarchy, - SubArray<4, float, CUDA> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Decomposition_5D_Double.cu b/src/mgard-x/DataRefactoring/CUDA/Decomposition_5D_Double.cu deleted file mode 100644 index 1c21fec0a9..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Decomposition_5D_Double.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<5, double, CUDA>(Hierarchy<5, double, CUDA> &hierarchy, - SubArray<5, double, CUDA> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<5, double, CUDA>(Hierarchy<5, double, CUDA> &hierarchy, - SubArray<5, double, CUDA> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Decomposition_5D_Float.cu b/src/mgard-x/DataRefactoring/CUDA/Decomposition_5D_Float.cu deleted file mode 100644 index 75c14f6bc4..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Decomposition_5D_Float.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<5, float, CUDA>(Hierarchy<5, float, CUDA> &hierarchy, - SubArray<5, float, CUDA> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<5, float, CUDA>(Hierarchy<5, float, CUDA> &hierarchy, - SubArray<5, float, CUDA> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Recomposition_1D_Double.cu b/src/mgard-x/DataRefactoring/CUDA/Recomposition_1D_Double.cu deleted file mode 100644 index 7dbfdf79d0..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Recomposition_1D_Double.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<1, double, CUDA>(Hierarchy<1, double, CUDA> &hierarchy, - SubArray<1, double, CUDA> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<1, double, CUDA>(Hierarchy<1, double, CUDA> &hierarchy, - SubArray<1, double, CUDA> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Recomposition_1D_Float.cu b/src/mgard-x/DataRefactoring/CUDA/Recomposition_1D_Float.cu deleted file mode 100644 index df0f4f84e5..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Recomposition_1D_Float.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<1, float, CUDA>(Hierarchy<1, float, CUDA> &hierarchy, - SubArray<1, float, CUDA> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<1, float, CUDA>(Hierarchy<1, float, CUDA> &hierarchy, - SubArray<1, float, CUDA> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Recomposition_2D_Double.cu b/src/mgard-x/DataRefactoring/CUDA/Recomposition_2D_Double.cu deleted file mode 100644 index ac6da649f8..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Recomposition_2D_Double.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<2, double, CUDA>(Hierarchy<2, double, CUDA> &hierarchy, - SubArray<2, double, CUDA> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<2, double, CUDA>(Hierarchy<2, double, CUDA> &hierarchy, - SubArray<2, double, CUDA> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Recomposition_2D_Float.cu b/src/mgard-x/DataRefactoring/CUDA/Recomposition_2D_Float.cu deleted file mode 100644 index 8c9f4297a4..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Recomposition_2D_Float.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<2, float, CUDA>(Hierarchy<2, float, CUDA> &hierarchy, - SubArray<2, float, CUDA> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<2, float, CUDA>(Hierarchy<2, float, CUDA> &hierarchy, - SubArray<2, float, CUDA> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Recomposition_3D_Double.cu b/src/mgard-x/DataRefactoring/CUDA/Recomposition_3D_Double.cu deleted file mode 100644 index 4dd374167e..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Recomposition_3D_Double.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<3, double, CUDA>(Hierarchy<3, double, CUDA> &hierarchy, - SubArray<3, double, CUDA> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<3, double, CUDA>(Hierarchy<3, double, CUDA> &hierarchy, - SubArray<3, double, CUDA> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Recomposition_3D_Float.cu b/src/mgard-x/DataRefactoring/CUDA/Recomposition_3D_Float.cu deleted file mode 100644 index 9834cd15f8..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Recomposition_3D_Float.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<3, float, CUDA>(Hierarchy<3, float, CUDA> &hierarchy, - SubArray<3, float, CUDA> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<3, float, CUDA>(Hierarchy<3, float, CUDA> &hierarchy, - SubArray<3, float, CUDA> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Recomposition_4D_Double.cu b/src/mgard-x/DataRefactoring/CUDA/Recomposition_4D_Double.cu deleted file mode 100644 index 242604bb42..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Recomposition_4D_Double.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<4, double, CUDA>(Hierarchy<4, double, CUDA> &hierarchy, - SubArray<4, double, CUDA> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<4, double, CUDA>(Hierarchy<4, double, CUDA> &hierarchy, - SubArray<4, double, CUDA> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Recomposition_4D_Float.cu b/src/mgard-x/DataRefactoring/CUDA/Recomposition_4D_Float.cu deleted file mode 100644 index 5747fe88b9..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Recomposition_4D_Float.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<4, float, CUDA>(Hierarchy<4, float, CUDA> &hierarchy, - SubArray<4, float, CUDA> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<4, float, CUDA>(Hierarchy<4, float, CUDA> &hierarchy, - SubArray<4, float, CUDA> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Recomposition_5D_Double.cu b/src/mgard-x/DataRefactoring/CUDA/Recomposition_5D_Double.cu deleted file mode 100644 index af972fd18c..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Recomposition_5D_Double.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<5, double, CUDA>(Hierarchy<5, double, CUDA> &hierarchy, - SubArray<5, double, CUDA> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<5, double, CUDA>(Hierarchy<5, double, CUDA> &hierarchy, - SubArray<5, double, CUDA> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Recomposition_5D_Float.cu b/src/mgard-x/DataRefactoring/CUDA/Recomposition_5D_Float.cu deleted file mode 100644 index 39bbf881fd..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Recomposition_5D_Float.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<5, float, CUDA>(Hierarchy<5, float, CUDA> &hierarchy, - SubArray<5, float, CUDA> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<5, float, CUDA>(Hierarchy<5, float, CUDA> &hierarchy, - SubArray<5, float, CUDA> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/CMakeLists.txt b/src/mgard-x/DataRefactoring/HIP/CMakeLists.txt deleted file mode 100644 index 1a7736f707..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/CMakeLists.txt +++ /dev/null @@ -1,24 +0,0 @@ -list(APPEND MGARD_X_HIP_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_1D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_1D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_1D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_1D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_2D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_2D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_2D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_2D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_3D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_3D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_3D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_3D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_4D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_4D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_4D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_4D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_5D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_5D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_5D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_5D_Double.cpp - ) - -set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Decomposition_1D_Double.cpp b/src/mgard-x/DataRefactoring/HIP/Decomposition_1D_Double.cpp deleted file mode 100644 index 2bd1056f70..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Decomposition_1D_Double.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, 
Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<1, double, HIP>(Hierarchy<1, double, HIP> &hierarchy, - SubArray<1, double, HIP> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<1, double, HIP>(Hierarchy<1, double, HIP> &hierarchy, - SubArray<1, double, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Decomposition_1D_Float.cpp b/src/mgard-x/DataRefactoring/HIP/Decomposition_1D_Float.cpp deleted file mode 100644 index 4c9e6abecf..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Decomposition_1D_Float.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<1, float, HIP>(Hierarchy<1, float, HIP> &hierarchy, - SubArray<1, float, HIP> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<1, float, HIP>(Hierarchy<1, float, HIP> &hierarchy, - SubArray<1, float, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Decomposition_2D_Double.cpp b/src/mgard-x/DataRefactoring/HIP/Decomposition_2D_Double.cpp deleted file mode 100644 index 5ae8cdbefc..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Decomposition_2D_Double.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<2, double, HIP>(Hierarchy<2, double, HIP> &hierarchy, - SubArray<2, double, HIP> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<2, double, HIP>(Hierarchy<2, double, HIP> &hierarchy, - SubArray<2, double, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Decomposition_2D_Float.cpp b/src/mgard-x/DataRefactoring/HIP/Decomposition_2D_Float.cpp deleted file mode 100644 index c7d7166065..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Decomposition_2D_Float.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<2, float, HIP>(Hierarchy<2, float, HIP> &hierarchy, - SubArray<2, float, HIP> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<2, float, HIP>(Hierarchy<2, float, HIP> &hierarchy, - SubArray<2, float, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Decomposition_3D_Double.cpp b/src/mgard-x/DataRefactoring/HIP/Decomposition_3D_Double.cpp deleted file mode 100644 index 9f5b06257c..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Decomposition_3D_Double.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<3, double, HIP>(Hierarchy<3, double, HIP> &hierarchy, - SubArray<3, double, HIP> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<3, double, HIP>(Hierarchy<3, double, HIP> &hierarchy, - SubArray<3, double, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Decomposition_3D_Float.cpp b/src/mgard-x/DataRefactoring/HIP/Decomposition_3D_Float.cpp deleted file mode 100644 index ab710b8056..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Decomposition_3D_Float.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<3, float, HIP>(Hierarchy<3, float, HIP> &hierarchy, - SubArray<3, float, HIP> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<3, float, HIP>(Hierarchy<3, float, HIP> &hierarchy, - SubArray<3, float, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Decomposition_4D_Double.cpp b/src/mgard-x/DataRefactoring/HIP/Decomposition_4D_Double.cpp deleted file mode 100644 index 7d8c6d1cb1..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Decomposition_4D_Double.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<4, double, HIP>(Hierarchy<4, double, HIP> &hierarchy, - SubArray<4, double, HIP> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<4, double, HIP>(Hierarchy<4, double, HIP> &hierarchy, - SubArray<4, double, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Decomposition_4D_Float.cpp b/src/mgard-x/DataRefactoring/HIP/Decomposition_4D_Float.cpp deleted file mode 100644 index b58becf5e4..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Decomposition_4D_Float.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<4, float, HIP>(Hierarchy<4, float, HIP> &hierarchy, - SubArray<4, float, HIP> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<4, float, HIP>(Hierarchy<4, float, HIP> &hierarchy, - SubArray<4, float, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Decomposition_5D_Double.cpp b/src/mgard-x/DataRefactoring/HIP/Decomposition_5D_Double.cpp deleted file mode 100644 index 9f70b93bb0..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Decomposition_5D_Double.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<5, double, HIP>(Hierarchy<5, double, HIP> &hierarchy, - SubArray<5, double, HIP> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<5, double, HIP>(Hierarchy<5, double, HIP> &hierarchy, - SubArray<5, double, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Decomposition_5D_Float.cpp b/src/mgard-x/DataRefactoring/HIP/Decomposition_5D_Float.cpp deleted file mode 100644 index 0d8afc5972..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Decomposition_5D_Float.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<5, float, HIP>(Hierarchy<5, float, HIP> &hierarchy, - SubArray<5, float, HIP> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<5, float, HIP>(Hierarchy<5, float, HIP> &hierarchy, - SubArray<5, float, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Recomposition_1D_Double.cpp b/src/mgard-x/DataRefactoring/HIP/Recomposition_1D_Double.cpp deleted file mode 100644 index 412e1291f7..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Recomposition_1D_Double.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<1, double, HIP>(Hierarchy<1, double, HIP> &hierarchy, - SubArray<1, double, HIP> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<1, double, HIP>(Hierarchy<1, double, HIP> &hierarchy, - SubArray<1, double, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Recomposition_1D_Float.cpp b/src/mgard-x/DataRefactoring/HIP/Recomposition_1D_Float.cpp deleted file mode 100644 index 0421843007..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Recomposition_1D_Float.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<1, float, HIP>(Hierarchy<1, float, HIP> &hierarchy, - SubArray<1, float, HIP> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<1, float, HIP>(Hierarchy<1, float, HIP> &hierarchy, - SubArray<1, float, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Recomposition_2D_Double.cpp b/src/mgard-x/DataRefactoring/HIP/Recomposition_2D_Double.cpp deleted file mode 100644 index bfc2bfb572..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Recomposition_2D_Double.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<2, double, HIP>(Hierarchy<2, double, HIP> &hierarchy, - SubArray<2, double, HIP> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<2, double, HIP>(Hierarchy<2, double, HIP> &hierarchy, - SubArray<2, double, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Recomposition_2D_Float.cpp b/src/mgard-x/DataRefactoring/HIP/Recomposition_2D_Float.cpp deleted file mode 100644 index 984b5ada23..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Recomposition_2D_Float.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<2, float, HIP>(Hierarchy<2, float, HIP> &hierarchy, - SubArray<2, float, HIP> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<2, float, HIP>(Hierarchy<2, float, HIP> &hierarchy, - SubArray<2, float, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Recomposition_3D_Double.cpp b/src/mgard-x/DataRefactoring/HIP/Recomposition_3D_Double.cpp deleted file mode 100644 index 65433c2022..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Recomposition_3D_Double.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<3, double, HIP>(Hierarchy<3, double, HIP> &hierarchy, - SubArray<3, double, HIP> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<3, double, HIP>(Hierarchy<3, double, HIP> &hierarchy, - SubArray<3, double, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Recomposition_3D_Float.cpp b/src/mgard-x/DataRefactoring/HIP/Recomposition_3D_Float.cpp deleted file mode 100644 index f9de58d439..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Recomposition_3D_Float.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<3, float, HIP>(Hierarchy<3, float, HIP> &hierarchy, - SubArray<3, float, HIP> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<3, float, HIP>(Hierarchy<3, float, HIP> &hierarchy, - SubArray<3, float, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Recomposition_4D_Double.cpp b/src/mgard-x/DataRefactoring/HIP/Recomposition_4D_Double.cpp deleted file mode 100644 index 165fe8af5b..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Recomposition_4D_Double.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<4, double, HIP>(Hierarchy<4, double, HIP> &hierarchy, - SubArray<4, double, HIP> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<4, double, HIP>(Hierarchy<4, double, HIP> &hierarchy, - SubArray<4, double, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Recomposition_4D_Float.cpp b/src/mgard-x/DataRefactoring/HIP/Recomposition_4D_Float.cpp deleted file mode 100644 index 4d9e0cab1d..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Recomposition_4D_Float.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<4, float, HIP>(Hierarchy<4, float, HIP> &hierarchy, - SubArray<4, float, HIP> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<4, float, HIP>(Hierarchy<4, float, HIP> &hierarchy, - SubArray<4, float, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Recomposition_5D_Double.cpp b/src/mgard-x/DataRefactoring/HIP/Recomposition_5D_Double.cpp deleted file mode 100644 index 2f9ae8423a..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Recomposition_5D_Double.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<5, double, HIP>(Hierarchy<5, double, HIP> &hierarchy, - SubArray<5, double, HIP> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<5, double, HIP>(Hierarchy<5, double, HIP> &hierarchy, - SubArray<5, double, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Recomposition_5D_Float.cpp b/src/mgard-x/DataRefactoring/HIP/Recomposition_5D_Float.cpp deleted file mode 100644 index 73ed87b844..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Recomposition_5D_Float.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<5, float, HIP>(Hierarchy<5, float, HIP> &hierarchy, - SubArray<5, float, HIP> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<5, float, HIP>(Hierarchy<5, float, HIP> &hierarchy, - SubArray<5, float, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/MultiDimension/CMakeLists.txt b/src/mgard-x/DataRefactoring/MultiDimension/CMakeLists.txt new file mode 100644 index 0000000000..cb89cb471d --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/CMakeLists.txt @@ -0,0 +1,9 @@ +add_subdirectory (Coefficient) +add_subdirectory (Correction) +add_subdirectory (CopyND) +MgardXGenerateSourceAllCombinations("Decompose") +MgardXGenerateSourceAllCombinations("Recompose") +set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) +set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) +set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) +set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CMakeLists.txt b/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CMakeLists.txt new file mode 100644 index 0000000000..c6bdc5530b --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CMakeLists.txt @@ -0,0 +1,8 @@ +MgardXGenerateSourceAllCombinations("CalcCoefficients3D") +MgardXGenerateSourceAllCombinations("CoefficientsRestore3D") +MgardXGenerateSourceAllCombinations("CalcCoefficientsND") +MgardXGenerateSourceAllCombinations("CoefficientsRestoreND") +set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) +set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) +set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) +set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) \ No newline at end of file
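Both CMakeLists above drop the long per-file source lists in favor of MgardXGenerateSourceAllCombinations, which stamps out one translation unit per (dimension, data type, backend) tuple from a single .cpp.in template. The module defining that helper is not part of this hunk; what follows is a minimal sketch of such a generator, assuming a configure_file()-based expansion and assuming the backend tag (SERIAL/CUDA/HIP/SYCL) doubles as the C++ device type substituted for @DEVICE_TYPE@. The real implementation may differ.

# Sketch only; not the actual MgardXGenerateSource module.
function(MgardXGenerateSourceAllCombinations name)
  foreach(DEVICE_TYPE SERIAL CUDA HIP SYCL)
    if(NOT MGARD_ENABLE_${DEVICE_TYPE})
      continue()                     # backend switched off; generate nothing
    endif()
    foreach(NUM_DIM RANGE 1 5)
      foreach(DATA_TYPE float double)
        set(out "${CMAKE_CURRENT_BINARY_DIR}/${name}_${NUM_DIM}D_${DATA_TYPE}_${DEVICE_TYPE}.cpp")
        # Substitute @NUM_DIM@/@DATA_TYPE@/@DEVICE_TYPE@ in <name>.cpp.in.
        configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${name}.cpp.in" "${out}" @ONLY)
        list(APPEND MGARD_X_${DEVICE_TYPE}_SRC "${out}")
      endforeach()
    endforeach()
    set(MGARD_X_${DEVICE_TYPE}_SRC ${MGARD_X_${DEVICE_TYPE}_SRC} PARENT_SCOPE)
  endforeach()
endfunction()

Centralizing the combination matrix this way is why supporting SYCL needs only the extra MGARD_X_SYCL_SRC export above rather than another twenty hand-written instantiation files like the ones each backend's DataRefactoring directory used to carry.

diff --git a/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficients3D.cpp.in b/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficients3D.cpp.in new file mode 100644 index 0000000000..b45a2c4075 --- /dev/null +++ 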
b/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficients3D.cpp.in @@ -0,0 +1,19 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficients3D.hpp" +// clang-format off +namespace mgard_x { + +template void CalcCoefficients3D<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + Hierarchy<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &hierarchy, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> dinput, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &doutput, SIZE l, + int queue_idx); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficientsND.cpp.in b/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficientsND.cpp.in new file mode 100644 index 0000000000..d183581175 --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficientsND.cpp.in @@ -0,0 +1,20 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficientsND.hpp" +// clang-format off +namespace mgard_x { + +template void CalcCoefficientsND<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + Hierarchy<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &hierarchy, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> dinput1, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> dinput2, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &doutput, SIZE l, + int queue_idx); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestore3D.cpp.in b/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestore3D.cpp.in new file mode 100644 index 0000000000..c7fec64b5e --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestore3D.cpp.in @@ -0,0 +1,19 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestore3D.hpp" +// clang-format off +namespace mgard_x { + +template void CoefficientsRestore3D<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + Hierarchy<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &hierarchy, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> dinput, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &doutput, SIZE l, + int queue_idx); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestoreND.cpp.in b/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestoreND.cpp.in new file mode 100644 index 0000000000..6d6da0248e --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestoreND.cpp.in @@ -0,0 +1,20 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. 
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestoreND.hpp" +// clang-format off +namespace mgard_x { + +template void CoefficientsRestoreND<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + Hierarchy<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &hierarchy, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> dinput1, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> dinput2, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &doutput, SIZE l, + int queue_idx); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/MultiDimension/CopyND/AddND.cpp.in b/src/mgard-x/DataRefactoring/MultiDimension/CopyND/AddND.cpp.in new file mode 100644 index 0000000000..642c34f3bb --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/CopyND/AddND.cpp.in @@ -0,0 +1,17 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/MultiDimension/CopyND/AddND.hpp" +// clang-format off +namespace mgard_x { + +template void AddND<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> dinput, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &doutput, int queue_idx); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/MultiDimension/CopyND/CMakeLists.txt b/src/mgard-x/DataRefactoring/MultiDimension/CopyND/CMakeLists.txt new file mode 100644 index 0000000000..305759b565 --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/CopyND/CMakeLists.txt @@ -0,0 +1,7 @@ +MgardXGenerateSourceAllCombinations("CopyND") +MgardXGenerateSourceAllCombinations("AddND") +MgardXGenerateSourceAllCombinations("SubtractND") +set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) +set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) +set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) +set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/MultiDimension/CopyND/CopyND.cpp.in b/src/mgard-x/DataRefactoring/MultiDimension/CopyND/CopyND.cpp.in new file mode 100644 index 0000000000..1050fdd6ee --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/CopyND/CopyND.cpp.in @@ -0,0 +1,17 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/MultiDimension/CopyND/CopyND.hpp" +// clang-format off +namespace mgard_x { + +template void CopyND<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> dinput, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &doutput, int queue_idx); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/MultiDimension/CopyND/SubtractND.cpp.in b/src/mgard-x/DataRefactoring/MultiDimension/CopyND/SubtractND.cpp.in new file mode 100644 index 0000000000..b63b081af3 --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/CopyND/SubtractND.cpp.in @@ -0,0 +1,17 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. 
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/MultiDimension/CopyND/SubtractND.hpp" +// clang-format off +namespace mgard_x { + +template void SubtractND<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> dinput, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &doutput, int queue_idx); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/MultiDimension/Correction/CMakeLists.txt b/src/mgard-x/DataRefactoring/MultiDimension/Correction/CMakeLists.txt new file mode 100644 index 0000000000..9c9fe8f704 --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/Correction/CMakeLists.txt @@ -0,0 +1,6 @@ +MgardXGenerateSourceAllCombinations("CalcCorrection3D") +MgardXGenerateSourceAllCombinations("CalcCorrectionND") +set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) +set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) +set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) +set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrection3D.cpp.in b/src/mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrection3D.cpp.in new file mode 100644 index 0000000000..ecf79d9b50 --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrection3D.cpp.in @@ -0,0 +1,19 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrection3D.hpp" +// clang-format off +namespace mgard_x { + +template void CalcCorrection3D<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + Hierarchy<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &hierarchy, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> dcoeff, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &dcorrection, SIZE l, + int queue_idx); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrectionND.cpp.in b/src/mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrectionND.cpp.in new file mode 100644 index 0000000000..a939ce52e2 --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrectionND.cpp.in @@ -0,0 +1,19 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. 
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrectionND.hpp" +// clang-format off +namespace mgard_x { + +template void CalcCorrectionND<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + Hierarchy<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &hierarchy, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> dcoeff, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &dcorrection, SIZE l, + int queue_idx); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/MultiDimension/Decompose.cpp.in b/src/mgard-x/DataRefactoring/MultiDimension/Decompose.cpp.in new file mode 100644 index 0000000000..39fedf66cd --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/Decompose.cpp.in @@ -0,0 +1,17 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" +// clang-format off +namespace mgard_x { + +template void decompose<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + Hierarchy<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &hierarchy, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &v, SIZE l_target, + int queue_idx); +} // namespace mgard_x +// clang-format on \ No newline at end of file
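Each .cpp.in above is an explicit-instantiation unit with the dimension, scalar type, and device left as configure-time placeholders. For illustration, expanding the Decompose.cpp.in template above with NUM_DIM=3, DATA_TYPE=float, DEVICE_TYPE=CUDA would produce roughly the following generated file (a sketch of the configure-time output; it is never checked in):

/*
 * Illustration: plausible configure-time expansion of Decompose.cpp.in
 * for NUM_DIM=3, DATA_TYPE=float, DEVICE_TYPE=CUDA.
 */
#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp"
// clang-format off
namespace mgard_x {

template void decompose<3, float, CUDA>(
    Hierarchy<3, float, CUDA> &hierarchy,
    SubArray<3, float, CUDA> &v, SIZE l_target,
    int queue_idx);
} // namespace mgard_x
// clang-format on

Keeping one instantiation per generated translation unit bounds the compile time and memory use of any single file, which matters for these heavily templated kernels.

diff --git a/src/mgard-x/DataRefactoring/MultiDimension/Recompose.cpp.in b/src/mgard-x/DataRefactoring/MultiDimension/Recompose.cpp.in new file mode 100644 index 0000000000..8f476aaac4 --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/Recompose.cpp.in @@ -0,0 +1,17 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. 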
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" +// clang-format off +namespace mgard_x { + +template void recompose<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + Hierarchy<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &hierarchy, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &v, SIZE l_target, + int queue_idx); +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/CMakeLists.txt b/src/mgard-x/DataRefactoring/Serial/CMakeLists.txt deleted file mode 100644 index cd1ed0e623..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/CMakeLists.txt +++ /dev/null @@ -1,24 +0,0 @@ -list(APPEND MGARD_X_SERIAL_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_1D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_1D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_1D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_1D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_2D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_2D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_2D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_2D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_3D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_3D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_3D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_3D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_4D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_4D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_4D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_4D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_5D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_5D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_5D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_5D_Double.cpp - ) - -set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Decomposition_1D_Double.cpp b/src/mgard-x/DataRefactoring/Serial/Decomposition_1D_Double.cpp deleted file mode 100644 index 6e21991762..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Decomposition_1D_Double.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - decompose<1, double, Serial>(Hierarchy<1, double, Serial> &hierarchy, - SubArray<1, double, Serial> &v, SIZE l_target, - int queue_idx); -template void - decompose_single<1, double, Serial>(Hierarchy<1, double, Serial> &hierarchy, - SubArray<1, double, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Decomposition_1D_Float.cpp b/src/mgard-x/DataRefactoring/Serial/Decomposition_1D_Float.cpp deleted file mode 100644 index 28a7d34959..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Decomposition_1D_Float.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - decompose<1, float, Serial>(Hierarchy<1, float, Serial> &hierarchy, - SubArray<1, float, Serial> &v, SIZE l_target, - int queue_idx); -template void - decompose_single<1, float, Serial>(Hierarchy<1, float, Serial> &hierarchy, - SubArray<1, float, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Decomposition_2D_Double.cpp b/src/mgard-x/DataRefactoring/Serial/Decomposition_2D_Double.cpp deleted file mode 100644 index 4885cac659..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Decomposition_2D_Double.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - decompose<2, double, Serial>(Hierarchy<2, double, Serial> &hierarchy, - SubArray<2, double, Serial> &v, SIZE l_target, - int queue_idx); -template void - decompose_single<2, double, Serial>(Hierarchy<2, double, Serial> &hierarchy, - SubArray<2, double, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Decomposition_2D_Float.cpp b/src/mgard-x/DataRefactoring/Serial/Decomposition_2D_Float.cpp deleted file mode 100644 index 1477d1bc23..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Decomposition_2D_Float.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - decompose<2, float, Serial>(Hierarchy<2, float, Serial> &hierarchy, - SubArray<2, float, Serial> &v, SIZE l_target, - int queue_idx); -template void - decompose_single<2, float, Serial>(Hierarchy<2, float, Serial> &hierarchy, - SubArray<2, float, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Decomposition_3D_Double.cpp b/src/mgard-x/DataRefactoring/Serial/Decomposition_3D_Double.cpp deleted file mode 100644 index 49de330b98..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Decomposition_3D_Double.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - decompose<3, double, Serial>(Hierarchy<3, double, Serial> &hierarchy, - SubArray<3, double, Serial> &v, SIZE l_target, - int queue_idx); -template void - decompose_single<3, double, Serial>(Hierarchy<3, double, Serial> &hierarchy, - SubArray<3, double, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Decomposition_3D_Float.cpp b/src/mgard-x/DataRefactoring/Serial/Decomposition_3D_Float.cpp deleted file mode 100644 index 41a35a8a1f..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Decomposition_3D_Float.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - decompose<3, float, Serial>(Hierarchy<3, float, Serial> &hierarchy, - SubArray<3, float, Serial> &v, SIZE l_target, - int queue_idx); -template void - decompose_single<3, float, Serial>(Hierarchy<3, float, Serial> &hierarchy, - SubArray<3, float, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Decomposition_4D_Double.cpp b/src/mgard-x/DataRefactoring/Serial/Decomposition_4D_Double.cpp deleted file mode 100644 index f59f81a53e..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Decomposition_4D_Double.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - decompose<4, double, Serial>(Hierarchy<4, double, Serial> &hierarchy, - SubArray<4, double, Serial> &v, SIZE l_target, - int queue_idx); -template void - decompose_single<4, double, Serial>(Hierarchy<4, double, Serial> &hierarchy, - SubArray<4, double, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Decomposition_4D_Float.cpp b/src/mgard-x/DataRefactoring/Serial/Decomposition_4D_Float.cpp deleted file mode 100644 index b0bf406e37..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Decomposition_4D_Float.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - decompose<4, float, Serial>(Hierarchy<4, float, Serial> &hierarchy, - SubArray<4, float, Serial> &v, SIZE l_target, - int queue_idx); -template void - decompose_single<4, float, Serial>(Hierarchy<4, float, Serial> &hierarchy, - SubArray<4, float, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Decomposition_5D_Double.cpp b/src/mgard-x/DataRefactoring/Serial/Decomposition_5D_Double.cpp deleted file mode 100644 index 439d1e9a02..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Decomposition_5D_Double.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - decompose<5, double, Serial>(Hierarchy<5, double, Serial> &hierarchy, - SubArray<5, double, Serial> &v, SIZE l_target, - int queue_idx); -template void - decompose_single<5, double, Serial>(Hierarchy<5, double, Serial> &hierarchy, - SubArray<5, double, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Decomposition_5D_Float.cpp b/src/mgard-x/DataRefactoring/Serial/Decomposition_5D_Float.cpp deleted file mode 100644 index d4ccdfd8e9..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Decomposition_5D_Float.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - decompose<5, float, Serial>(Hierarchy<5, float, Serial> &hierarchy, - SubArray<5, float, Serial> &v, SIZE l_target, - int queue_idx); -template void - decompose_single<5, float, Serial>(Hierarchy<5, float, Serial> &hierarchy, - SubArray<5, float, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Recomposition_1D_Double.cpp b/src/mgard-x/DataRefactoring/Serial/Recomposition_1D_Double.cpp deleted file mode 100644 index 4a71227ea1..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Recomposition_1D_Double.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - recompose<1, double, Serial>(Hierarchy<1, double, Serial> &hierarchy, - SubArray<1, double, Serial> &v, SIZE l_target, - int queue_idx); -template void - recompose_single<1, double, Serial>(Hierarchy<1, double, Serial> &hierarchy, - SubArray<1, double, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Recomposition_1D_Float.cpp b/src/mgard-x/DataRefactoring/Serial/Recomposition_1D_Float.cpp deleted file mode 100644 index 4ccd960f83..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Recomposition_1D_Float.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - recompose<1, float, Serial>(Hierarchy<1, float, Serial> &hierarchy, - SubArray<1, float, Serial> &v, SIZE l_target, - int queue_idx); -template void - recompose_single<1, float, Serial>(Hierarchy<1, float, Serial> &hierarchy, - SubArray<1, float, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Recomposition_2D_Double.cpp b/src/mgard-x/DataRefactoring/Serial/Recomposition_2D_Double.cpp deleted file mode 100644 index 16365c6a1d..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Recomposition_2D_Double.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - recompose<2, double, Serial>(Hierarchy<2, double, Serial> &hierarchy, - SubArray<2, double, Serial> &v, SIZE l_target, - int queue_idx); -template void - recompose_single<2, double, Serial>(Hierarchy<2, double, Serial> &hierarchy, - SubArray<2, double, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Recomposition_2D_Float.cpp b/src/mgard-x/DataRefactoring/Serial/Recomposition_2D_Float.cpp deleted file mode 100644 index 7f1257c497..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Recomposition_2D_Float.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - recompose<2, float, Serial>(Hierarchy<2, float, Serial> &hierarchy, - SubArray<2, float, Serial> &v, SIZE l_target, - int queue_idx); -template void - recompose_single<2, float, Serial>(Hierarchy<2, float, Serial> &hierarchy, - SubArray<2, float, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Recomposition_3D_Double.cpp b/src/mgard-x/DataRefactoring/Serial/Recomposition_3D_Double.cpp deleted file mode 100644 index 9af949eda2..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Recomposition_3D_Double.cpp +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - recompose<3, double, Serial>(Hierarchy<3, double, Serial> &hierarchy, - SubArray<3, double, Serial> &v, SIZE l_target, - int queue_idx); -template void - recompose_single<3, double, Serial>(Hierarchy<3, double, Serial> &hierarchy, - SubArray<3, double, Serial> &v, - SIZE l_target, int queue_idx); - -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Recomposition_3D_Float.cpp b/src/mgard-x/DataRefactoring/Serial/Recomposition_3D_Float.cpp deleted file mode 100644 index 039087a200..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Recomposition_3D_Float.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - recompose<3, float, Serial>(Hierarchy<3, float, Serial> &hierarchy, - SubArray<3, float, Serial> &v, SIZE l_target, - int queue_idx); -template void - recompose_single<3, float, Serial>(Hierarchy<3, float, Serial> &hierarchy, - SubArray<3, float, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Recomposition_4D_Double.cpp b/src/mgard-x/DataRefactoring/Serial/Recomposition_4D_Double.cpp deleted file mode 100644 index 10bd8c9807..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Recomposition_4D_Double.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - recompose<4, double, Serial>(Hierarchy<4, double, Serial> &hierarchy, - SubArray<4, double, Serial> &v, SIZE l_target, - int queue_idx); -template void - recompose_single<4, double, Serial>(Hierarchy<4, double, Serial> &hierarchy, - SubArray<4, double, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Recomposition_4D_Float.cpp b/src/mgard-x/DataRefactoring/Serial/Recomposition_4D_Float.cpp deleted file mode 100644 index 70e147dfdc..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Recomposition_4D_Float.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - recompose<4, float, Serial>(Hierarchy<4, float, Serial> &hierarchy, - SubArray<4, float, Serial> &v, SIZE l_target, - int queue_idx); -template void - recompose_single<4, float, Serial>(Hierarchy<4, float, Serial> &hierarchy, - SubArray<4, float, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Recomposition_5D_Double.cpp b/src/mgard-x/DataRefactoring/Serial/Recomposition_5D_Double.cpp deleted file mode 100644 index 751d51c27c..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Recomposition_5D_Double.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - recompose<5, double, Serial>(Hierarchy<5, double, Serial> &hierarchy, - SubArray<5, double, Serial> &v, SIZE l_target, - int queue_idx); -template void - recompose_single<5, double, Serial>(Hierarchy<5, double, Serial> &hierarchy, - SubArray<5, double, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Recomposition_5D_Float.cpp b/src/mgard-x/DataRefactoring/Serial/Recomposition_5D_Float.cpp deleted file mode 100644 index 513e797869..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Recomposition_5D_Float.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - recompose<5, float, Serial>(Hierarchy<5, float, Serial> &hierarchy, - SubArray<5, float, Serial> &v, SIZE l_target, - int queue_idx); -template void - recompose_single<5, float, Serial>(Hierarchy<5, float, Serial> &hierarchy, - SubArray<5, float, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/SingleDimension/CMakeLists.txt b/src/mgard-x/DataRefactoring/SingleDimension/CMakeLists.txt new file mode 100644 index 0000000000..258dd0b00f --- /dev/null +++ b/src/mgard-x/DataRefactoring/SingleDimension/CMakeLists.txt @@ -0,0 +1,8 @@ +add_subdirectory(Coefficient) +add_subdirectory(Correction) +MgardXGenerateSourceAllCombinations("Decompose") +MgardXGenerateSourceAllCombinations("Recompose") +set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) +set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) +set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) +set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/SingleDimension/Coefficient/CMakeLists.txt b/src/mgard-x/DataRefactoring/SingleDimension/Coefficient/CMakeLists.txt new file mode 100644 index 0000000000..7ca1850c9f --- /dev/null +++ b/src/mgard-x/DataRefactoring/SingleDimension/Coefficient/CMakeLists.txt @@ -0,0 +1,6 @@ +MgardXGenerateSourceAllCombinations("CalcCoefficients") +MgardXGenerateSourceAllCombinations("CoefficientsRestore") +set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) +set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) +set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) +set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/SingleDimension/Coefficient/CalcCoefficients.cpp.in b/src/mgard-x/DataRefactoring/SingleDimension/Coefficient/CalcCoefficients.cpp.in new file mode 100644 index 0000000000..a14844d10b --- /dev/null +++ b/src/mgard-x/DataRefactoring/SingleDimension/Coefficient/CalcCoefficients.cpp.in @@ -0,0 +1,19 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. 
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/SingleDimension/Coefficient/CalcCoefficients.hpp" +// clang-format off +namespace mgard_x { + +template void CalcCoefficients<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + DIM current_dim, SubArray<1, @DATA_TYPE@, @DEVICE_TYPE@> ratio, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> v, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> coarse, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> coeff, int queue_idx); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/SingleDimension/Coefficient/CoefficientsRestore.cpp.in b/src/mgard-x/DataRefactoring/SingleDimension/Coefficient/CoefficientsRestore.cpp.in new file mode 100644 index 0000000000..b2f2de4a06 --- /dev/null +++ b/src/mgard-x/DataRefactoring/SingleDimension/Coefficient/CoefficientsRestore.cpp.in @@ -0,0 +1,19 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/SingleDimension/Coefficient/CoefficientsRestore.hpp" +// clang-format off +namespace mgard_x { + +template void CoefficientsRestore<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + DIM current_dim, SubArray<1, @DATA_TYPE@, @DEVICE_TYPE@> ratio, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> v, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> coarse, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> coeff, int queue_idx); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/SingleDimension/Correction/CMakeLists.txt b/src/mgard-x/DataRefactoring/SingleDimension/Correction/CMakeLists.txt new file mode 100644 index 0000000000..c80531852e --- /dev/null +++ b/src/mgard-x/DataRefactoring/SingleDimension/Correction/CMakeLists.txt @@ -0,0 +1,5 @@ +MgardXGenerateSourceAllCombinations("CalcCorrection") +set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) +set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) +set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) +set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/SingleDimension/Correction/CalcCorrection.cpp.in b/src/mgard-x/DataRefactoring/SingleDimension/Correction/CalcCorrection.cpp.in new file mode 100644 index 0000000000..f357e86c95 --- /dev/null +++ b/src/mgard-x/DataRefactoring/SingleDimension/Correction/CalcCorrection.cpp.in @@ -0,0 +1,19 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. 
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/SingleDimension/Correction/CalcCorrection.hpp" +// clang-format off +namespace mgard_x { + +template void CalcCorrection<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + Hierarchy<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &hierarchy, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &coeff, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &correction, + SIZE curr_dim, SIZE l, int queue_idx); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/SingleDimension/Decompose.cpp.in b/src/mgard-x/DataRefactoring/SingleDimension/Decompose.cpp.in new file mode 100644 index 0000000000..a40de0f84c --- /dev/null +++ b/src/mgard-x/DataRefactoring/SingleDimension/Decompose.cpp.in @@ -0,0 +1,17 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" +// clang-format off +namespace mgard_x { + +template void decompose_single<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + Hierarchy<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &hierarchy, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &v, SIZE l_target, + int queue_idx); +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/SingleDimension/Recompose.cpp.in b/src/mgard-x/DataRefactoring/SingleDimension/Recompose.cpp.in new file mode 100644 index 0000000000..891956a814 --- /dev/null +++ b/src/mgard-x/DataRefactoring/SingleDimension/Recompose.cpp.in @@ -0,0 +1,17 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory.
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" +// clang-format off +namespace mgard_x { + +template void recompose_single<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + Hierarchy<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &hierarchy, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &v, SIZE l_target, + int queue_idx); +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/Executables/mgard-x-autotuner.cpp b/src/mgard-x/Executables/mgard-x-autotuner.cpp index 104c124dc7..934f094e6f 100644 --- a/src/mgard-x/Executables/mgard-x-autotuner.cpp +++ b/src/mgard-x/Executables/mgard-x-autotuner.cpp @@ -68,13 +68,15 @@ int launch_compress(mgard_x::DIM D, enum mgard_x::data_type dtype, void autotuning(enum mgard_x::device_type dev_type, std::vector<mgard_x::SIZE> shape) { - if (dev_type == mgard_x::device_type::Serial) { + if (dev_type == mgard_x::device_type::SERIAL) { std::cout << mgard_x::log::log_info - << "Start autotuning MGARD-X::Serial.\n"; + << "Start autotuning MGARD-X::SERIAL.\n"; } else if (dev_type == mgard_x::device_type::CUDA) { std::cout << mgard_x::log::log_info << "Start auto tuning MGARD-X::CUDA.\n"; } else if (dev_type == mgard_x::device_type::HIP) { std::cout << mgard_x::log::log_info << "Start auto tuning MGARD-X::HIP.\n"; + } else if (dev_type == mgard_x::device_type::SYCL) { + std::cout << mgard_x::log::log_info << "Start auto tuning MGARD-X::SYCL.\n"; } mgard_x::BeginAutoTuning(dev_type); std::cout << mgard_x::log::log_info @@ -88,13 +90,15 @@ void autotuning(enum mgard_x::device_type dev_type, dev_type); std::cout << "Done.\n"; mgard_x::EndAutoTuning(dev_type); - if (dev_type == mgard_x::device_type::Serial) { + if (dev_type == mgard_x::device_type::SERIAL) { std::cout << mgard_x::log::log_info - << "Done auto tuning MGARD-X::Serial.\n"; + << "Done auto tuning MGARD-X::SERIAL.\n"; } else if (dev_type == mgard_x::device_type::CUDA) { std::cout << mgard_x::log::log_info << "Done auto tuning MGARD-X::CUDA.\n"; } else if (dev_type == mgard_x::device_type::HIP) { std::cout << mgard_x::log::log_info << "Done auto tuning MGARD-X::HIP.\n"; + } else if (dev_type == mgard_x::device_type::SYCL) { + std::cout << mgard_x::log::log_info << "Done auto tuning MGARD-X::SYCL.\n"; } std::cout << mgard_x::log::log_info << "Please recompile MGARD-X to make the auto tuning effective.\n"; @@ -105,7 +109,7 @@ void print_usage_message(std::string error) { std::cout << mgard_x::log::log_err << error << std::endl; } printf("* Full automatic mode: run 'mgard-x-autotuner' without arguments\n\ -* For a specific backend: run 'mgard-x-autotuner -d <serial|cuda|hip>'\n\ +* For a specific backend: run 'mgard-x-autotuner -d <serial|cuda|hip|sycl>'\n\ * For a specific input size on a specific backend: run 'mgard-x-autotuner -d -n [dim1] [dim2] ...
[dimN]'\n"); exit(0); } @@ -186,14 +190,17 @@ int main(int argc, char *argv[]) { std::cout << "\n"; std::string dev = get_arg(argc, argv, "-d"); if (dev.compare("serial") == 0) { - dev_type = mgard_x::device_type::Serial; - std::cout << mgard_x::log::log_info << "device type: Serial\n"; + dev_type = mgard_x::device_type::SERIAL; + std::cout << mgard_x::log::log_info << "device type: SERIAL\n"; } else if (dev.compare("cuda") == 0) { dev_type = mgard_x::device_type::CUDA; std::cout << mgard_x::log::log_info << "device type: CUDA\n"; } else if (dev.compare("hip") == 0) { dev_type = mgard_x::device_type::HIP; std::cout << mgard_x::log::log_info << "device type: HIP\n"; + } else if (dev.compare("sycl") == 0) { + dev_type = mgard_x::device_type::SYCL; + std::cout << mgard_x::log::log_info << "device type: SYCL\n"; } else { std::cout << "wrong device type.\n"; exit(-1); @@ -203,14 +210,17 @@ } std::vector<mgard_x::SIZE> shape({513, 513, 513}); std::string dev = get_arg(argc, argv, "-d"); if (dev.compare("serial") == 0) { - dev_type = mgard_x::device_type::Serial; - std::cout << mgard_x::log::log_info << "device type: Serial\n"; + dev_type = mgard_x::device_type::SERIAL; + std::cout << mgard_x::log::log_info << "device type: SERIAL\n"; } else if (dev.compare("cuda") == 0) { dev_type = mgard_x::device_type::CUDA; std::cout << mgard_x::log::log_info << "device type: CUDA\n"; } else if (dev.compare("hip") == 0) { dev_type = mgard_x::device_type::HIP; std::cout << mgard_x::log::log_info << "device type: HIP\n"; + } else if (dev.compare("sycl") == 0) { + dev_type = mgard_x::device_type::SYCL; + std::cout << mgard_x::log::log_info << "device type: SYCL\n"; } else { std::cout << "wrong device type.\n"; exit(-1); @@ -219,14 +229,17 @@ } else { std::cout << mgard_x::log::log_info << "Full automatic mode\n"; std::vector<mgard_x::SIZE> shape({513, 513, 513}); -#ifdef MGARD_ENABLE_SERIAL - autotuning(mgard_x::device_type::Serial, shape); +#if MGARD_ENABLE_SERIAL + autotuning(mgard_x::device_type::SERIAL, shape); #endif -#ifdef MGARD_ENABLE_CUDA +#if MGARD_ENABLE_CUDA autotuning(mgard_x::device_type::CUDA, shape); #endif -#ifdef MGARD_ENABLE_HIP +#if MGARD_ENABLE_HIP autotuning(mgard_x::device_type::HIP, shape); +#endif +#if MGARD_ENABLE_SYCL + autotuning(mgard_x::device_type::SYCL, shape); #endif } return 0; diff --git a/src/mgard-x/Executables/mgard-x.cpp b/src/mgard-x/Executables/mgard-x.cpp index 195822a468..eddea57f20 100644 --- a/src/mgard-x/Executables/mgard-x.cpp +++ b/src/mgard-x/Executables/mgard-x.cpp @@ -471,17 +471,20 @@ bool try_compression(int argc, char *argv[]) { enum mgard_x::device_type dev_type; std::string dev = get_arg(argc, argv, "-d"); if (dev.compare("auto") == 0) { - dev_type = mgard_x::device_type::Auto; - std::cout << mgard_x::log::log_info << "device type: Auto\n"; + dev_type = mgard_x::device_type::AUTO; + std::cout << mgard_x::log::log_info << "device type: AUTO\n"; } else if (dev.compare("serial") == 0) { - dev_type = mgard_x::device_type::Serial; - std::cout << mgard_x::log::log_info << "device type: Serial\n"; + dev_type = mgard_x::device_type::SERIAL; + std::cout << mgard_x::log::log_info << "device type: SERIAL\n"; } else if (dev.compare("cuda") == 0) { dev_type = mgard_x::device_type::CUDA; std::cout << mgard_x::log::log_info << "device type: CUDA\n"; } else if (dev.compare("hip") == 0) { dev_type = mgard_x::device_type::HIP; std::cout << mgard_x::log::log_info << "device type: HIP\n"; + } else if (dev.compare("sycl") == 0) { + dev_type
= mgard_x::device_type::SYCL; + std::cout << mgard_x::log::log_info << "device type: SYCL\n"; } else { print_usage_message("wrong device type."); } @@ -517,17 +520,20 @@ bool try_decompression(int argc, char *argv[]) { enum mgard_x::device_type dev_type; std::string dev = get_arg(argc, argv, "-d"); if (dev.compare("auto") == 0) { - dev_type = mgard_x::device_type::Auto; - std::cout << mgard_x::log::log_info << "device type: Auto\n"; + dev_type = mgard_x::device_type::AUTO; + std::cout << mgard_x::log::log_info << "device type: AUTO\n"; } else if (dev.compare("serial") == 0) { - dev_type = mgard_x::device_type::Serial; - std::cout << mgard_x::log::log_info << "device type: Serial\n"; + dev_type = mgard_x::device_type::SERIAL; + std::cout << mgard_x::log::log_info << "device type: SERIAL\n"; } else if (dev.compare("cuda") == 0) { dev_type = mgard_x::device_type::CUDA; std::cout << mgard_x::log::log_info << "device type: CUDA\n"; } else if (dev.compare("hip") == 0) { dev_type = mgard_x::device_type::HIP; std::cout << mgard_x::log::log_info << "device type: HIP\n"; + } else if (dev.compare("sycl") == 0) { + dev_type = mgard_x::device_type::SYCL; + std::cout << mgard_x::log::log_info << "device type: SYCL\n"; } else { print_usage_message("wrong device type."); } diff --git a/src/mgard-x/HighLevelAPI/CMakeLists.txt b/src/mgard-x/HighLevelAPI/CMakeLists.txt deleted file mode 100644 index 5a55fee427..0000000000 --- a/src/mgard-x/HighLevelAPI/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -if(MGARD_ENABLE_SERIAL) - add_subdirectory (Serial) - set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) -endif() -if(MGARD_ENABLE_CUDA) - add_subdirectory (CUDA) - set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) -endif() -if(MGARD_ENABLE_HIP) - add_subdirectory (HIP) - set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) -endif() - -list(APPEND MGARD_X_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/DynamicAPI.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Metadata.cpp) - -set(MGARD_X_SRC ${MGARD_X_SRC} PARENT_SCOPE) diff --git a/src/mgard-x/HighLevelAPI/CUDA/CMakeLists.txt b/src/mgard-x/HighLevelAPI/CUDA/CMakeLists.txt deleted file mode 100644 index b6c2bfe8ea..0000000000 --- a/src/mgard-x/HighLevelAPI/CUDA/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -list(APPEND MGARD_X_CUDA_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/HighLevelAPI.cu) - -set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/HighLevelAPI/CUDA/HighLevelAPI.cu b/src/mgard-x/HighLevelAPI/CUDA/HighLevelAPI.cu deleted file mode 100644 index 6dae3be9a4..0000000000 --- a/src/mgard-x/HighLevelAPI/CUDA/HighLevelAPI.cu +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory.
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/HighLevelAPI.hpp" - -#include - -#include -namespace mgard_x { - -template void compress(DIM D, data_type dtype, std::vector shape, - double tol, double s, enum error_bound_type mode, - const void *original_data, void *&compressed_data, - size_t &compressed_size, Config config, - bool output_pre_allocated); - -template void compress(DIM D, data_type dtype, std::vector shape, - double tol, double s, enum error_bound_type mode, - const void *original_data, void *&compressed_data, - size_t &compressed_size, - bool output_pre_allocated); - -template void compress(DIM D, data_type dtype, std::vector shape, - double tol, double s, enum error_bound_type mode, - const void *original_data, void *&compressed_data, - size_t &compressed_size, - std::vector coords, Config config, - bool output_pre_allocated); - -template void compress(DIM D, data_type dtype, std::vector shape, - double tol, double s, enum error_bound_type mode, - const void *original_data, void *&compressed_data, - size_t &compressed_size, - std::vector coords, - bool output_pre_allocated); - -template void decompress(const void *compressed_data, - size_t compressed_size, void *&decompressed_data, - Config config, bool output_pre_allocated); - -template void decompress(const void *compressed_data, - size_t compressed_size, void *&decompressed_data, - bool output_pre_allocated); - -template void decompress(const void *compressed_data, - size_t compressed_size, void *&decompressed_data, - data_type &dtype, - std::vector &shape, Config config, - bool output_pre_allocated); - -template void decompress(const void *compressed_data, - size_t compressed_size, void *&decompressed_data, - data_type &dtype, - std::vector &shape, - bool output_pre_allocated); - -template void BeginAutoTuning(); -template void EndAutoTuning(); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/HighLevelAPI/HIP/CMakeLists.txt b/src/mgard-x/HighLevelAPI/HIP/CMakeLists.txt deleted file mode 100644 index 553f8fdedb..0000000000 --- a/src/mgard-x/HighLevelAPI/HIP/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -list(APPEND MGARD_X_HIP_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/HighLevelAPI.cpp) - -set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/HighLevelAPI/HIP/HighLevelAPI.cpp b/src/mgard-x/HighLevelAPI/HIP/HighLevelAPI.cpp deleted file mode 100644 index ba9672cd3b..0000000000 --- a/src/mgard-x/HighLevelAPI/HIP/HighLevelAPI.cpp +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/HighLevelAPI.hpp" - -#include - -#include -namespace mgard_x { - -template void compress(DIM D, data_type dtype, std::vector shape, - double tol, double s, enum error_bound_type mode, - const void *original_data, void *&compressed_data, - size_t &compressed_size, Config config, - bool output_pre_allocated); - -template void compress(DIM D, data_type dtype, std::vector shape, - double tol, double s, enum error_bound_type mode, - const void *original_data, void *&compressed_data, - size_t &compressed_size, bool output_pre_allocated); - -template void compress(DIM D, data_type dtype, std::vector shape, - double tol, double s, enum error_bound_type mode, - const void *original_data, void *&compressed_data, - size_t &compressed_size, - std::vector coords, Config config, - bool output_pre_allocated); - -template void compress(DIM D, data_type dtype, std::vector shape, - double tol, double s, enum error_bound_type mode, - const void *original_data, void *&compressed_data, - size_t &compressed_size, - std::vector coords, - bool output_pre_allocated); - -template void decompress(const void *compressed_data, - size_t compressed_size, void *&decompressed_data, - Config config, bool output_pre_allocated); - -template void decompress(const void *compressed_data, - size_t compressed_size, void *&decompressed_data, - bool output_pre_allocated); - -template void decompress(const void *compressed_data, - size_t compressed_size, void *&decompressed_data, - data_type &dtype, - std::vector &shape, Config config, - bool output_pre_allocated); - -template void decompress(const void *compressed_data, - size_t compressed_size, void *&decompressed_data, - data_type &dtype, - std::vector &shape, - bool output_pre_allocated); - -template void BeginAutoTuning(); -template void EndAutoTuning(); - -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/HighLevelAPI/Metadata.cpp b/src/mgard-x/HighLevelAPI/Metadata.cpp deleted file mode 100644 index 64e292512e..0000000000 --- a/src/mgard-x/HighLevelAPI/Metadata.cpp +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include -#include -#include -#include -#include - -// #include "compress_cuda.hpp" -#include "mgard-x/Hierarchy.h" -#include "mgard-x/Metadata.hpp" -#include "mgard-x/RuntimeX/RuntimeXPublic.h" - -namespace mgard_x { - -// bool verify(const void *compressed_data, size_t compressed_size) { -// if (compressed_size < SIGNATURE_SIZE) -// return false; -// Metadata meta; -// meta.Deserialize((SERIALIZED_TYPE *)compressed_data); -// for (size_t i = 0; i < SIGNATURE_SIZE; i++) { -// if (meta.signature[i] != meta.mgard_signature[i]) { -// return false; -// } -// } -// return true; -// } - -// enum data_type infer_data_type(const void *compressed_data, -// size_t compressed_size) { -// if (!verify(compressed_data, compressed_size)) { -// std::cout << log::log_err << "cannot verify the data!\n"; -// exit(-1); -// } -// Metadata meta; -// meta.Deserialize((SERIALIZED_TYPE *)compressed_data); -// return meta.dtype; -// } - -// std::vector infer_shape(const void *compressed_data, -// size_t compressed_size) { -// if (!verify(compressed_data, compressed_size)) { -// std::cout << log::log_err << "cannot verify the data!\n"; -// exit(-1); -// } - -// Metadata meta; -// meta.Deserialize((SERIALIZED_TYPE *)compressed_data); -// std::vector shape(meta.total_dims); -// for (DIM d = 0; d < meta.total_dims; d++) { -// shape[d] = (SIZE)meta.shape[d]; -// } -// return shape; -// } - -// enum data_structure_type infer_data_structure(const void *compressed_data, -// size_t compressed_size) { -// if (!verify(compressed_data, compressed_size)) { -// std::cout << log::log_err << "cannot verify the data!\n"; -// exit(-1); -// } -// Metadata meta; -// meta.Deserialize((SERIALIZED_TYPE *)compressed_data); -// return meta.dstype; -// } - -// template -// std::vector infer_coords(const void *compressed_data, -// size_t compressed_size) { -// if (!verify(compressed_data, compressed_size)) { -// std::cout << log::log_err << "cannot verify the data!\n"; -// exit(-1); -// } -// Metadata meta; -// meta.Deserialize((SERIALIZED_TYPE *)compressed_data); -// std::vector shape(meta.total_dims); -// for (DIM d = 0; d < meta.total_dims; d++) { -// shape[d] = (SIZE)meta.shape[d]; -// } -// std::vector coords(meta.total_dims); -// for (DIM d = 0; d < meta.total_dims; d++) { -// coords[d] = (T *)std::malloc(shape[d] * sizeof(T)); -// for (SIZE i = 0; i < shape[d]; i++) { -// coords[d][i] = (T)meta.coords[d][i]; -// } -// } -// return coords; -// } - -// template std::vector infer_coords(const void *compressed_data, -// size_t compressed_size); -// template std::vector infer_coords(const void *compressed_data, -// size_t compressed_size); - -// std::string infer_nonuniform_coords_file(const void *compressed_data, -// size_t compressed_size) { -// if (!verify(compressed_data, compressed_size)) { -// std::cout << log::log_err << "cannot verify the data!\n"; -// exit(-1); -// } -// Metadata meta; -// meta.Deserialize((SERIALIZED_TYPE *)compressed_data); -// return std::string(meta.nonuniform_coords_file); -// } - -// bool infer_domain_decomposed(const void *compressed_data, -// size_t compressed_size) { -// if (!verify(compressed_data, compressed_size)) { -// std::cout << log::log_err << "cannot verify the data!\n"; -// exit(-1); -// } -// Metadata meta; -// meta.Deserialize((SERIALIZED_TYPE *)compressed_data); -// return meta.domain_decomposed; -// } - -} // namespace mgard_x \ No 
newline at end of file diff --git a/src/mgard-x/HighLevelAPI/Serial/CMakeLists.txt b/src/mgard-x/HighLevelAPI/Serial/CMakeLists.txt deleted file mode 100644 index ffa7bbf66f..0000000000 --- a/src/mgard-x/HighLevelAPI/Serial/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -list(APPEND MGARD_X_SERIAL_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/HighLevelAPI.cpp) - -set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/HighLevelAPI/Serial/HighLevelAPI.cpp b/src/mgard-x/HighLevelAPI/Serial/HighLevelAPI.cpp deleted file mode 100644 index dd62370aba..0000000000 --- a/src/mgard-x/HighLevelAPI/Serial/HighLevelAPI.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/HighLevelAPI.hpp" - -#include - -#include -namespace mgard_x { - -template void compress(DIM D, data_type dtype, std::vector shape, - double tol, double s, enum error_bound_type mode, - const void *original_data, - void *&compressed_data, size_t &compressed_size, - Config config, bool output_pre_allocated); - -template void compress(DIM D, data_type dtype, std::vector shape, - double tol, double s, enum error_bound_type mode, - const void *original_data, - void *&compressed_data, size_t &compressed_size, - bool output_pre_allocated); - -template void compress(DIM D, data_type dtype, std::vector shape, - double tol, double s, enum error_bound_type mode, - const void *original_data, - void *&compressed_data, size_t &compressed_size, - std::vector coords, Config config, - bool output_pre_allocated); - -template void compress(DIM D, data_type dtype, std::vector shape, - double tol, double s, enum error_bound_type mode, - const void *original_data, - void *&compressed_data, size_t &compressed_size, - std::vector coords, - bool output_pre_allocated); - -template void decompress(const void *compressed_data, - size_t compressed_size, - void *&decompressed_data, Config config, - bool output_pre_allocated); - -template void decompress(const void *compressed_data, - size_t compressed_size, - void *&decompressed_data, - bool output_pre_allocated); - -template void decompress(const void *compressed_data, - size_t compressed_size, - void *&decompressed_data, data_type &dtype, - std::vector &shape, - Config config, bool output_pre_allocated); - -template void decompress(const void *compressed_data, - size_t compressed_size, - void *&decompressed_data, data_type &dtype, - std::vector &shape, - bool output_pre_allocated); - -template void BeginAutoTuning(); -template void EndAutoTuning(); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/LosslessCompression.cu b/src/mgard-x/LosslessCompression.cu deleted file mode 100644 index d9048fca54..0000000000 --- a/src/mgard-x/LosslessCompression.cu +++ /dev/null @@ -1,469 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "compressors.hpp" -#include "cuda/Common.h" -#include "cuda/CommonInternal.h" -#include "cuda/LosslessCompression.h" -#include "cuda/ParallelHuffman/huffman_workflow.cuh" -// #include "cuda/ParallelHuffman/Huffman.hpp" - -#include - -namespace mgard_x { - -template -void cascaded_compress(Handle &handle, C *input_data, size_t intput_count, - void *&output_data, size_t &output_size, int n_rle, - int n_de, bool bitpack, int queue_idx) { - - nvcomp::CascadedCompressor compressor(nvcomp::TypeOf(), n_rle, n_de, - bitpack); - - size_t *temp_bytes; - cudaMallocHostHelper((void **)&temp_bytes, sizeof(size_t)); - size_t *output_bytes; - cudaMallocHostHelper((void **)&output_bytes, sizeof(size_t)); - - compressor.configure(intput_count * sizeof(C), temp_bytes, output_bytes); - - void *temp_space; - cudaMallocHelper(handle, &temp_space, *temp_bytes); - cudaMallocHelper(handle, &output_data, *output_bytes); - - compressor.compress_async(input_data, intput_count * sizeof(C), temp_space, - *temp_bytes, output_data, output_bytes, - *(cudaStream_t *)handle.get(queue_idx)); - handle.sync(queue_idx); - output_size = *output_bytes; - cudaFreeHelper(temp_space); - cudaFreeHostHelper(temp_bytes); - cudaFreeHostHelper(output_bytes); -} - -template -void cascaded_decompress(Handle &handle, void *input_data, - size_t input_size, C *&output_data, int queue_idx) { - - // nvcomp::Decompressor decompressor(input_data, input_size, - // *(cudaStream_t - // *)handle.get(queue_idx)); - - nvcomp::CascadedDecompressor decompressor; - - size_t *temp_bytes; - cudaMallocHostHelper((void **)&temp_bytes, sizeof(size_t)); - size_t *output_bytes; - cudaMallocHostHelper((void **)&output_bytes, sizeof(size_t)); - - decompressor.configure(input_data, input_size, temp_bytes, output_bytes, - *(cudaStream_t *)handle.get(queue_idx)); - - void *temp_space; - cudaMallocHelper(handle, (void **)&temp_space, *temp_bytes); - cudaMallocHelper(handle, (void **)&output_data, *output_bytes); - - decompressor.decompress_async(input_data, input_size, temp_space, *temp_bytes, - output_data, *output_bytes, - *(cudaStream_t *)handle.get(queue_idx)); - handle.sync(queue_idx); - cudaFreeHelper(temp_space); - cudaFreeHostHelper(temp_bytes); - cudaFreeHostHelper(output_bytes); -} - -template -void lz4_compress(Handle &handle, C *input_data, size_t input_count, - void *&output_data, size_t &output_size, size_t chunk_size, - int queue_idx) { - nvcompType_t dtype = NVCOMP_TYPE_UCHAR; - nvcomp::LZ4Compressor compressor(chunk_size, dtype); - - size_t *temp_bytes; - cudaMallocHostHelper((void **)&temp_bytes, sizeof(size_t)); - size_t *output_bytes; - cudaMallocHostHelper((void **)&output_bytes, sizeof(size_t)); - - compressor.configure(input_count * sizeof(C), temp_bytes, output_bytes); - - void *temp_space; - cudaMallocHelper(handle, &temp_space, *temp_bytes); - cudaMallocHelper(handle, &output_data, *output_bytes); - - compressor.compress_async(input_data, input_count * sizeof(C), temp_space, - *temp_bytes, output_data, output_bytes, - *(cudaStream_t *)handle.get(queue_idx)); - - handle.sync(queue_idx); - output_size = *output_bytes; - cudaFreeHelper(temp_space); - cudaFreeHostHelper(temp_bytes); - cudaFreeHostHelper(output_bytes); -} - -template -void lz4_decompress(Handle &handle, void *input_data, size_t input_size, - C *&output_data, size_t &output_size, int queue_idx) { - - 
nvcomp::LZ4Decompressor decompressor; - - size_t *temp_bytes; - cudaMallocHostHelper((void **)&temp_bytes, sizeof(size_t)); - size_t *output_bytes; - cudaMallocHostHelper((void **)&output_bytes, sizeof(size_t)); - - decompressor.configure(input_data, input_size, temp_bytes, output_bytes, - *(cudaStream_t *)handle.get(queue_idx)); - - void *temp_space; - cudaMallocHelper(handle, (void **)&temp_space, *temp_bytes); - cudaMallocHelper(handle, (void **)&output_data, *output_bytes); - - decompressor.decompress_async(input_data, input_size, temp_space, *temp_bytes, - output_data, *output_bytes, - *(cudaStream_t *)handle.get(queue_idx)); - handle.sync(queue_idx); - output_size = *output_bytes; - cudaFreeHelper(temp_space); - cudaFreeHostHelper(temp_bytes); - cudaFreeHostHelper(output_bytes); -} - -#define KERNELS(D, T, C) \ - template void cascaded_compress( \ - Handle & handle, C * input_data, size_t intput_count, \ - void *&output_data, size_t &output_size, int n_rle, int n_de, \ - bool bitpack, int queue_idx); \ - template void cascaded_decompress( \ - Handle & handle, void *input_data, size_t input_size, \ - C *&output_data, int queue_idx); \ - template void lz4_compress(Handle & handle, C * input_data, \ - size_t input_count, void *&output_data, \ - size_t &output_size, size_t chunk_size, \ - int queue_idx); \ - template void lz4_decompress( \ - Handle & handle, void *input_data, size_t input_size, \ - C *&output_data, size_t &output_count, int queue_idx); - -KERNELS(1, double, uint8_t) -KERNELS(1, float, uint8_t) -KERNELS(2, double, uint8_t) -KERNELS(2, float, uint8_t) -KERNELS(3, double, uint8_t) -KERNELS(3, float, uint8_t) -KERNELS(4, double, uint8_t) -KERNELS(4, float, uint8_t) -KERNELS(5, double, uint8_t) -KERNELS(5, float, uint8_t) -KERNELS(1, double, uint32_t) -KERNELS(1, float, uint32_t) -KERNELS(2, double, uint32_t) -KERNELS(2, float, uint32_t) -KERNELS(3, double, uint32_t) -KERNELS(3, float, uint32_t) -KERNELS(4, double, uint32_t) -KERNELS(4, float, uint32_t) -KERNELS(5, double, uint32_t) -KERNELS(5, float, uint32_t) -KERNELS(1, double, uint64_t) -KERNELS(1, float, uint64_t) -KERNELS(2, double, uint64_t) -KERNELS(2, float, uint64_t) -KERNELS(3, double, uint64_t) -KERNELS(3, float, uint64_t) -KERNELS(4, double, uint64_t) -KERNELS(4, float, uint64_t) -KERNELS(5, double, uint64_t) -KERNELS(5, float, uint64_t) -#undef KERNELS - -template -void SeparateOutlierAndPrimary(Handle &handle, S *dqv, size_t n, - size_t *outlier_idx, size_t outlier_count, - size_t primary_count, S *doutlier, Q *dprimary, - int queue_idx) { - - // printf("compress outlier_idx: "); for(int i = 0; i < outlier_count; i++) - // {printf("%llu ", outlier_idx[i]);} printf("\n"); - printf("compress outlier_count: %llu\n", outlier_count); - printf("compress primary_count: %llu\n", primary_count); - printf("start separating primary and outlier\n"); - - size_t p = 0; - size_t pp = 0; - size_t op = 0; - size_t size = outlier_idx[0] - 0; - // printf("copy primary\n"); - if (size > 0) { - mgard_x::cudaMemcpyAsyncHelper(handle, dprimary + pp, dqv + p, - size * sizeof(Q), mgard_x::D2D, queue_idx); - } - pp += size; - p += size; - - for (int i = 0; i < outlier_count - 1; i++) { - size = 1; - // printf("copy outlier\n"); - mgard_x::cudaMemcpyAsyncHelper(handle, doutlier + op, dqv + p, - size * sizeof(S), mgard_x::D2D, queue_idx); - op += size; - p += size; - size = outlier_idx[i + 1] - outlier_idx[i] - 1; - // printf("copy primary %d %d %d\n", p, size, outlier_idx[outlier_idx.size() - // - 1]); - if (size > 0) { - 
mgard_x::cudaMemcpyAsyncHelper(handle, dprimary + pp, dqv + p, - size * sizeof(Q), mgard_x::D2D, queue_idx); - } - pp += size; - p += size; - } - size = 1; - // printf("copy outlier\n"); - mgard_x::cudaMemcpyAsyncHelper(handle, doutlier + op, dqv + p, - size * sizeof(S), mgard_x::D2D, queue_idx); - op += size; - p += size; - size = n - outlier_idx[outlier_count - 1] - 1; - // printf("copy primary %d %d %d\n", p, size, outlier_idx[outlier_idx.size() - - // 1]); - if (size > 0) { - mgard_x::cudaMemcpyAsyncHelper(handle, dprimary + pp, dqv + p, - size * sizeof(Q), mgard_x::D2D, queue_idx); - } - // printf("done copy primary\n"); - pp += size; - p += size; - - if (pp != primary_count || op != outlier_count) { - printf("Primary or outlier size mismatch!\n"); - } - printf("done separating primary and outlier\n"); -} - -template -void CombineOutlierAndPrimary(Handle &handle, S *dqv, size_t n, - size_t *outlier_idx, size_t outlier_count, - size_t primary_count, S *doutlier, Q *dprimary, - int queue_idx) { - size_t p = 0; - size_t pp = 0; - size_t op = 0; - size_t size = outlier_idx[0] - 0; - // printf("copy primary\n"); - if (size > 0) { - mgard_x::cudaMemcpyAsyncHelper(handle, dqv + p, dprimary + pp, - size * sizeof(Q), mgard_x::D2D, queue_idx); - } - pp += size; - p += size; - - for (int i = 0; i < outlier_count - 1; i++) { - size = 1; - // printf("copy outlier\n"); - mgard_x::cudaMemcpyAsyncHelper(handle, dqv + p, doutlier + op, - size * sizeof(S), mgard_x::D2D, queue_idx); - op += size; - p += size; - size = outlier_idx[i + 1] - outlier_idx[i] - 1; - // printf("copy primary %d %d %d\n", p, size, outlier_idx[outlier_idx.size() - // - 1]); - if (size > 0) { - mgard_x::cudaMemcpyAsyncHelper(handle, dqv + p, dprimary + pp, - size * sizeof(Q), mgard_x::D2D, queue_idx); - } - pp += size; - p += size; - } - size = 1; - // printf("copy outlier\n"); - mgard_x::cudaMemcpyAsyncHelper(handle, dqv + p, doutlier + op, - size * sizeof(S), mgard_x::D2D, queue_idx); - op += size; - p += size; - size = n - outlier_idx[outlier_count - 1] - 1; - // printf("copy primary %d %d %d\n", p, size, outlier_idx[outlier_idx.size() - - // 1]); - if (size > 0) { - mgard_x::cudaMemcpyAsyncHelper(handle, dqv + p, dprimary + pp, - size * sizeof(Q), mgard_x::D2D, queue_idx); - } - // printf("done copy primary\n"); - pp += size; - p += size; -} - -#define KERNELS(D, T, S, Q) \ - template void SeparateOutlierAndPrimary( \ - Handle & handle, S * dqv, size_t n, size_t * outlier_idx,\ - size_t outlier_count, \ - size_t primary_count,\ - S * doutlier, \ - Q * dprimary, int queue_idx); \ - template void CombineOutlierAndPrimary( \ - Handle & handle, S * dqv, size_t n, size_t * outlier_idx,\ - size_t outlier_count, \ - size_t primary_count,\ - S * doutlier, \ - Q * dprimary, int queue_idx); - -KERNELS(1, double, int, uint32_t) -KERNELS(1, float, int, uint32_t) -KERNELS(2, double, int, uint32_t) -KERNELS(2, float, int, uint32_t) -KERNELS(3, double, int, uint32_t) -KERNELS(3, float, int, uint32_t) -KERNELS(4, double, int, uint32_t) -KERNELS(4, float, int, uint32_t) -KERNELS(5, double, int, uint32_t) -KERNELS(5, float, int, uint32_t) -#undef KERNELS - -template -void huffman_compress(Handle &handle, S *input_data, size_t input_count, - std::vector &outlier_idx, H *&out_meta, - size_t &out_meta_size, H *&out_data, - size_t &out_data_size, int chunk_size, int dict_size, - int queue_idx) { - - // HuffmanEncode(handle, input_data, input_count, - // outlier_idx, - // out_meta, out_meta_size, out_data, - // out_data_size, chunk_size, dict_size); 
-} - -template -void huffman_decompress(Handle &handle, H *in_meta, size_t in_meta_size, - H *in_data, size_t in_data_size, S *&output_data, - size_t &output_count, int queue_idx) { - // HuffmanDecode(handle, output_data, output_count, - // in_meta, - // in_meta_size, in_data, in_data_size); -} - -#define KERNELS(D, T, S, Q, H) \ - template void huffman_compress( \ - Handle & handle, S * input_data, size_t input_count, \ - std::vector & outlier_idx, H * &out_meta, \ - size_t & out_meta_size, H * &out_data, size_t & out_data_size, \ - int chunk_size, int dict_size, int queue_idx); \ - template void huffman_decompress( \ - Handle & handle, H * in_meta, size_t in_meta_size, H * in_data, \ - size_t in_data_size, S * &output_data, size_t & output_count, \ - int queue_idx); - -KERNELS(1, double, int, uint32_t, uint32_t) -KERNELS(1, float, int, uint32_t, uint32_t) -KERNELS(2, double, int, uint32_t, uint32_t) -KERNELS(2, float, int, uint32_t, uint32_t) -KERNELS(3, double, int, uint32_t, uint32_t) -KERNELS(3, float, int, uint32_t, uint32_t) -KERNELS(4, double, int, uint32_t, uint32_t) -KERNELS(4, float, int, uint32_t, uint32_t) -KERNELS(5, double, int, uint32_t, uint32_t) -KERNELS(5, float, int, uint32_t, uint32_t) -KERNELS(1, double, int, uint32_t, uint64_t) -KERNELS(1, float, int, uint32_t, uint64_t) -KERNELS(2, double, int, uint32_t, uint64_t) -KERNELS(2, float, int, uint32_t, uint64_t) -KERNELS(3, double, int, uint32_t, uint64_t) -KERNELS(3, float, int, uint32_t, uint64_t) -KERNELS(4, double, int, uint32_t, uint64_t) -KERNELS(4, float, int, uint32_t, uint64_t) -KERNELS(5, double, int, uint32_t, uint64_t) -KERNELS(5, float, int, uint32_t, uint64_t) - -template -void cpu_lossless_compression(Handle &handle, S *input_data, - size_t input_count, H *&out_data, - size_t &out_data_size) { - - int *int_vector = new int[input_count]; - - cudaMemcpyAsyncHelper(handle, int_vector, input_data, input_count * sizeof(S), - AUTO, 0); - handle.sync(0); - - std::vector input_vector(input_count); - for (int i = 0; i < input_count; i++) - input_vector[i] = int_vector[i]; - - // printf("%u %u\n", sizeof(long int), sizeof(int)); - // printf("dqv\n"); - // print_matrix_cuda(1, input_count, input_data, input_count); - - // printf("input_vector: "); - // for (int i = 0; i < input_vector.size(); i++) printf("%d ", - // input_vector[i]); printf("\n"); Compress an array of data using `zstd`. 
- std::size_t zstd_outsize; - - void *const buffer = - mgard::compress_memory_huffman(input_vector, zstd_outsize); - - out_data_size = zstd_outsize; - - cudaMallocHelper(handle, (void **)&out_data, out_data_size); - cudaMemcpyAsyncHelper(handle, out_data, buffer, out_data_size, AUTO, 0); - handle.sync(0); - delete[] int_vector; -} - -template -void cpu_lossless_decompression(Handle &handle, H *input_data, - size_t input_count, S *&out_data, - size_t output_count) { - - // printf("cpu decompression: %llu\n", input_count); - std::vector input_vector(input_count); - cudaMemcpyAsyncHelper(handle, input_vector.data(), input_data, input_count, - AUTO, 0); - handle.sync(0); - // printf("copy done\n"); - - long int *output_vector = new long int[output_count]; - int *int_vector = new int[output_count]; - - mgard::decompress_memory_huffman( - reinterpret_cast(input_vector.data()), - input_vector.size(), output_vector, - output_count * sizeof(*output_vector)); - - for (int i = 0; i < output_count; i++) - int_vector[i] = output_vector[i]; - cudaMallocHelper(handle, (void **)&out_data, output_count * sizeof(S)); - cudaMemcpyAsyncHelper(handle, out_data, int_vector, output_count * sizeof(S), - AUTO, 0); - handle.sync(0); - delete[] output_vector; - delete[] int_vector; - - // printf("dqv\n"); - // print_matrix_cuda(1, output_count, out_data, output_count); -} - -#define KERNELS(D, T, S, H) \ - template void cpu_lossless_compression( \ - Handle & handle, S * input_data, size_t input_count, \ - H * &out_data, size_t & out_data_size); \ - template void cpu_lossless_decompression( \ - Handle & handle, H * input_data, size_t input_count, \ - S * &out_data, size_t output_count); - -KERNELS(1, double, int, unsigned char) -KERNELS(1, float, int, unsigned char) -KERNELS(2, double, int, unsigned char) -KERNELS(2, float, int, unsigned char) -KERNELS(3, double, int, unsigned char) -KERNELS(3, float, int, unsigned char) -KERNELS(4, double, int, unsigned char) -KERNELS(4, float, int, unsigned char) -KERNELS(5, double, int, unsigned char) -KERNELS(5, float, int, unsigned char) - -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/RuntimeX/AutoTuners/AutoTunerCuda.cu b/src/mgard-x/RuntimeX/AutoTuners/AutoTunerCuda.cu index 9bb60b6038..363f5c8a7b 100644 --- a/src/mgard-x/RuntimeX/AutoTuners/AutoTunerCuda.cu +++ b/src/mgard-x/RuntimeX/AutoTuners/AutoTunerCuda.cu @@ -62,6 +62,9 @@ int AutoTuningTable<CUDA>::lwdqzk[2][9] = {{0, 0, 0, 0, 0, 0, 0, 0, 0}, int AutoTuningTable<CUDA>::llk[2][9] = {{0, 0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0, 0}}; +template void BeginAutoTuning<CUDA>(); +template void EndAutoTuning<CUDA>(); + } // namespace mgard_x // clang-format on #undef MGARDX_COMPILE_CUDA \ No newline at end of file diff --git a/src/mgard-x/RuntimeX/AutoTuners/AutoTunerHip.cpp b/src/mgard-x/RuntimeX/AutoTuners/AutoTunerHip.cpp index 3a91e76f7d..9c7254c5c5 100644 --- a/src/mgard-x/RuntimeX/AutoTuners/AutoTunerHip.cpp +++ b/src/mgard-x/RuntimeX/AutoTuners/AutoTunerHip.cpp @@ -61,6 +61,10 @@ int AutoTuningTable<HIP>::lwdqzk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, {2, 2, 3, 3, 0, 2, 5, 0, 0}}; int AutoTuningTable<HIP>::llk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, {2, 2, 3, 3, 0, 2, 5, 0, 0}}; + +template void BeginAutoTuning<HIP>(); +template void EndAutoTuning<HIP>(); + } // namespace mgard_x // clang-format on #undef MGARDX_COMPILE_HIP \ No newline at end of file diff --git a/src/mgard-x/RuntimeX/AutoTuners/AutoTunerSerial.cpp b/src/mgard-x/RuntimeX/AutoTuners/AutoTunerSerial.cpp index f735ef4e2e..58fc4245e6 100644 ---
a/src/mgard-x/RuntimeX/AutoTuners/AutoTunerSerial.cpp +++ b/src/mgard-x/RuntimeX/AutoTuners/AutoTunerSerial.cpp @@ -2,65 +2,69 @@ // clang-format off namespace mgard_x { -int AutoTuningTable<Serial>::gpk_reo_3d[2][9] = {{5, 5, 5, 3, 3, 5, 5, 0, 0}, +int AutoTuningTable<SERIAL>::gpk_reo_3d[2][9] = {{5, 5, 5, 3, 3, 5, 5, 0, 0}, {3, 6, 5, 3, 3, 3, 5, 0, 0}}; -int AutoTuningTable<Serial>::gpk_rev_3d[2][9] = {{2, 4, 5, 5, 3, 5, 5, 0, 0}, +int AutoTuningTable<SERIAL>::gpk_rev_3d[2][9] = {{2, 4, 5, 5, 3, 5, 5, 0, 0}, {3, 6, 6, 5, 3, 5, 6, 0, 0}}; -int AutoTuningTable<Serial>::gpk_reo_nd[2][9] = {{0, 0, 3, 4, 3, 0, 0, 0, 0}, +int AutoTuningTable<SERIAL>::gpk_reo_nd[2][9] = {{0, 0, 3, 4, 3, 0, 0, 0, 0}, {0, 0, 3, 4, 5, 0, 0, 0, 0}}; -int AutoTuningTable<Serial>::gpk_rev_nd[2][9] = {{0, 0, 3, 4, 3, 0, 0, 0, 0}, +int AutoTuningTable<SERIAL>::gpk_rev_nd[2][9] = {{0, 0, 3, 4, 3, 0, 0, 0, 0}, {0, 0, 3, 4, 5, 0, 0, 0, 0}}; -int AutoTuningTable<Serial>::lpk1_3d[2][9] = {{4, 4, 1, 1, 1, 1, 1, 0, 0}, +int AutoTuningTable<SERIAL>::lpk1_3d[2][9] = {{4, 4, 1, 1, 1, 1, 1, 0, 0}, {1, 1, 1, 1, 1, 1, 1, 0, 0}}; -int AutoTuningTable<Serial>::lpk2_3d[2][9] = {{5, 4, 4, 4, 3, 3, 4, 0, 0}, +int AutoTuningTable<SERIAL>::lpk2_3d[2][9] = {{5, 4, 4, 4, 3, 3, 4, 0, 0}, {4, 1, 1, 1, 1, 1, 3, 0, 0}}; -int AutoTuningTable<Serial>::lpk3_3d[2][9] = {{4, 4, 3, 3, 2, 3, 4, 0, 0}, +int AutoTuningTable<SERIAL>::lpk3_3d[2][9] = {{4, 4, 3, 3, 2, 3, 4, 0, 0}, {1, 1, 1, 1, 1, 1, 2, 0, 0}}; -int AutoTuningTable<Serial>::lpk1_nd[2][9] = {{2, 0, 1, 1, 1, 0, 0, 0, 0}, +int AutoTuningTable<SERIAL>::lpk1_nd[2][9] = {{2, 0, 1, 1, 1, 0, 0, 0, 0}, {0, 0, 1, 1, 1, 0, 0, 0, 0}}; -int AutoTuningTable<Serial>::lpk2_nd[2][9] = {{2, 1, 3, 1, 0, 0, 0, 0, 0}, +int AutoTuningTable<SERIAL>::lpk2_nd[2][9] = {{2, 1, 3, 1, 0, 0, 0, 0, 0}, {0, 2, 1, 1, 0, 0, 0, 0, 0}}; -int AutoTuningTable<Serial>::lpk3_nd[2][9] = {{2, 3, 1, 1, 0, 0, 0, 0, 0}, +int AutoTuningTable<SERIAL>::lpk3_nd[2][9] = {{2, 3, 1, 1, 0, 0, 0, 0, 0}, {0, 2, 1, 1, 0, 0, 0, 0, 0}}; -int AutoTuningTable<Serial>::ipk1_3d[2][9] = {{3, 3, 4, 5, 5, 3, 4, 0, 0}, +int AutoTuningTable<SERIAL>::ipk1_3d[2][9] = {{3, 3, 4, 5, 5, 3, 4, 0, 0}, {3, 6, 4, 4, 3, 3, 3, 0, 0}}; -int AutoTuningTable<Serial>::ipk2_3d[2][9] = {{3, 3, 2, 2, 2, 2, 6, 0, 0}, +int AutoTuningTable<SERIAL>::ipk2_3d[2][9] = {{3, 3, 2, 2, 2, 2, 6, 0, 0}, {2, 2, 2, 2, 2, 2, 5, 0, 0}}; -int AutoTuningTable<Serial>::ipk3_3d[2][9] = {{3, 3, 2, 2, 2, 2, 1, 0, 0}, +int AutoTuningTable<SERIAL>::ipk3_3d[2][9] = {{3, 3, 2, 2, 2, 2, 1, 0, 0}, {2, 2, 2, 2, 2, 2, 6, 0, 0}}; -int AutoTuningTable<Serial>::ipk1_nd[2][9] = {{0, 2, 3, 3, 0, 0, 0, 0, 0}, +int AutoTuningTable<SERIAL>::ipk1_nd[2][9] = {{0, 2, 3, 3, 0, 0, 0, 0, 0}, {0, 3, 3, 3, 0, 0, 0, 0, 0}}; -int AutoTuningTable<Serial>::ipk2_nd[2][9] = {{0, 1, 2, 2, 0, 0, 0, 0, 0}, +int AutoTuningTable<SERIAL>::ipk2_nd[2][9] = {{0, 1, 2, 2, 0, 0, 0, 0, 0}, {0, 2, 2, 2, 0, 0, 0, 0, 0}}; -int AutoTuningTable<Serial>::ipk3_nd[2][9] = {{0, 2, 3, 2, 0, 0, 0, 0, 0}, +int AutoTuningTable<SERIAL>::ipk3_nd[2][9] = {{0, 2, 3, 2, 0, 0, 0, 0, 0}, {0, 3, 4, 2, 0, 0, 0, 0, 0}}; -int AutoTuningTable<Serial>::lwpk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, +int AutoTuningTable<SERIAL>::lwpk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, {2, 2, 3, 3, 0, 2, 5, 0, 0}}; -int AutoTuningTable<Serial>::lwqzk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, +int AutoTuningTable<SERIAL>::lwqzk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, {2, 2, 3, 3, 0, 2, 5, 0, 0}}; -int AutoTuningTable<Serial>::lwdqzk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, +int AutoTuningTable<SERIAL>::lwdqzk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, {2, 2, 3, 3, 0, 2, 5, 0, 0}}; -int AutoTuningTable<Serial>::llk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, +int AutoTuningTable<SERIAL>::llk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, {2, 2, 3, 3, 0, 2, 5, 0, 0}}; + +template void BeginAutoTuning<SERIAL>(); +template void EndAutoTuning<SERIAL>(); + } // namespace mgard_x // clang-format on #undef MGARDX_COMPILE_SERIAL \ No newline at end of file diff --git a/src/mgard-x/RuntimeX/AutoTuners/AutoTunerSycl.cpp b/src/mgard-x/RuntimeX/AutoTuners/AutoTunerSycl.cpp new file mode 100644 index 0000000000..4f3abaf39d --- /dev/null +++ b/src/mgard-x/RuntimeX/AutoTuners/AutoTunerSycl.cpp @@ -0,0 +1,70 @@ +#include "mgard-x/RuntimeX/RuntimeX.h" +// clang-format off +namespace mgard_x { + +int AutoTuningTable<SYCL>::gpk_reo_3d[2][9] = {{5, 5, 5, 3, 3, 5, 5, 0, 0}, + {3, 6, 5, 3, 3, 3, 5, 0, 0}}; + +int AutoTuningTable<SYCL>::gpk_rev_3d[2][9] = {{2, 4, 5, 5, 3, 5, 5, 0, 0}, + {3, 6, 6, 5, 3, 5, 6, 0, 0}}; + +int AutoTuningTable<SYCL>::gpk_reo_nd[2][9] = {{0, 0, 3, 4, 3, 0, 0, 0, 0}, + {0, 0, 3, 4, 5, 0, 0, 0, 0}}; + +int AutoTuningTable<SYCL>::gpk_rev_nd[2][9] = {{0, 0, 3, 4, 3, 0, 0, 0, 0}, + {0, 0, 3, 4, 5, 0, 0, 0, 0}}; + +int AutoTuningTable<SYCL>::lpk1_3d[2][9] = {{4, 4, 1, 1, 1, 1, 1, 0, 0}, + {1, 1, 1, 1, 1, 1, 1, 0, 0}}; + +int AutoTuningTable<SYCL>::lpk2_3d[2][9] = {{5, 4, 4, 4, 3, 3, 4, 0, 0}, + {4, 1, 1, 1, 1, 1, 3, 0, 0}}; + +int AutoTuningTable<SYCL>::lpk3_3d[2][9] = {{4, 4, 3, 3, 2, 3, 4, 0, 0}, + {1, 1, 1, 1, 1, 1, 2, 0, 0}}; + +int AutoTuningTable<SYCL>::lpk1_nd[2][9] = {{2, 0, 1, 1, 1, 0, 0, 0, 0}, + {0, 0, 1, 1, 1, 0, 0, 0, 0}}; + +int AutoTuningTable<SYCL>::lpk2_nd[2][9] = {{2, 1, 3, 1, 0, 0, 0, 0, 0}, + {0, 2, 1, 1, 0, 0, 0, 0, 0}}; + +int AutoTuningTable<SYCL>::lpk3_nd[2][9] = {{2, 3, 1, 1, 0, 0, 0, 0, 0}, + {0, 2, 1, 1, 0, 0, 0, 0, 0}}; + +int AutoTuningTable<SYCL>::ipk1_3d[2][9] = {{3, 3, 4, 5, 5, 3, 4, 0, 0}, + {3, 6, 4, 4, 3, 3, 3, 0, 0}}; + +int AutoTuningTable<SYCL>::ipk2_3d[2][9] = {{3, 3, 2, 2, 2, 2, 6, 0, 0}, + {2, 2, 2, 2, 2, 2, 5, 0, 0}}; + +int AutoTuningTable<SYCL>::ipk3_3d[2][9] = {{3, 3, 2, 2, 2, 2, 1, 0, 0}, + {2, 2, 2, 2, 2, 2, 6, 0, 0}}; + +int AutoTuningTable<SYCL>::ipk1_nd[2][9] = {{0, 2, 3, 3, 0, 0, 0, 0, 0}, + {0, 3, 3, 3, 0, 0, 0, 0, 0}}; + +int AutoTuningTable<SYCL>::ipk2_nd[2][9] = {{0, 1, 2, 2, 0, 0, 0, 0, 0}, + {0, 2, 2, 2, 0, 0, 0, 0, 0}}; + +int AutoTuningTable<SYCL>::ipk3_nd[2][9] = {{0, 2, 3, 2, 0, 0, 0, 0, 0}, + {0, 3, 4, 2, 0, 0, 0, 0, 0}}; + +int AutoTuningTable<SYCL>::lwpk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, + {2, 2, 3, 3, 0, 2, 5, 0, 0}}; + +int AutoTuningTable<SYCL>::lwqzk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, + {2, 2, 3, 3, 0, 2, 5, 0, 0}}; + +int AutoTuningTable<SYCL>::lwdqzk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, + {2, 2, 3, 3, 0, 2, 5, 0, 0}}; + +int AutoTuningTable<SYCL>::llk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, + {2, 2, 3, 3, 0, 2, 5, 0, 0}}; + +template void BeginAutoTuning<SYCL>(); +template void EndAutoTuning<SYCL>(); + +} // namespace mgard_x +// clang-format on +#undef MGARDX_COMPILE_SYCL \ No newline at end of file diff --git a/src/mgard-x/RuntimeX/AutoTuners/CMakeLists.txt b/src/mgard-x/RuntimeX/AutoTuners/CMakeLists.txt index 88fad48f09..488a237a84 100644 --- a/src/mgard-x/RuntimeX/AutoTuners/CMakeLists.txt +++ b/src/mgard-x/RuntimeX/AutoTuners/CMakeLists.txt @@ -13,6 +13,11 @@ if(MGARD_ENABLE_HIP) ${CMAKE_CURRENT_SOURCE_DIR}/AutoTunerHip.cpp) set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) endif() +if(MGARD_ENABLE_SYCL) + list(APPEND MGARD_X_SYCL_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/AutoTunerSycl.cpp) + set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) +endif() if(MGARD_ENABLE_KOKKOS) list(APPEND MGARD_X_SRC ${CMAKE_CURRENT_SOURCE_DIR}/AutoTunerKokkos.cpp) diff --git a/src/mgard-x/RuntimeX/CMakeLists.txt b/src/mgard-x/RuntimeX/CMakeLists.txt index 0b8f543e8b..cc3b28c1b7 100644 --- a/src/mgard-x/RuntimeX/CMakeLists.txt +++ b/src/mgard-x/RuntimeX/CMakeLists.txt @@ -4,4 +4,5 @@
add_subdirectory (Utilities) set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) +set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) set(MGARD_X_SRC ${MGARD_X_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/RuntimeX/DeviceAdapters/CMakeLists.txt b/src/mgard-x/RuntimeX/DeviceAdapters/CMakeLists.txt index 651543fb27..eb71bc40f5 100644 --- a/src/mgard-x/RuntimeX/DeviceAdapters/CMakeLists.txt +++ b/src/mgard-x/RuntimeX/DeviceAdapters/CMakeLists.txt @@ -13,6 +13,11 @@ if(MGARD_ENABLE_HIP) ${CMAKE_CURRENT_SOURCE_DIR}/DeviceAdapterHip.cpp) set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) endif() +if(MGARD_ENABLE_SYCL) + list(APPEND MGARD_X_SYCL_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/DeviceAdapterSycl.cpp) + set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) +endif() if(MGARD_ENABLE_KOKKOS) list(APPEND MGARD_X_KOKKOS_SRC ${CMAKE_CURRENT_SOURCE_DIR}/DeviceAdapterKokkos.cpp) diff --git a/src/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSerial.cpp b/src/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSerial.cpp index 2461062814..af16904532 100644 --- a/src/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSerial.cpp +++ b/src/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSerial.cpp @@ -9,20 +9,20 @@ namespace mgard_x { -int DeviceRuntime<Serial>::curr_dev_id = 0; -DeviceQueues<Serial> DeviceRuntime<Serial>::queues; -DeviceSpecification<Serial> DeviceRuntime<Serial>::DeviceSpecs; +int DeviceRuntime<SERIAL>::curr_dev_id = 0; +DeviceQueues<SERIAL> DeviceRuntime<SERIAL>::queues; +DeviceSpecification<SERIAL> DeviceRuntime<SERIAL>::DeviceSpecs; -bool DeviceRuntime<Serial>::SyncAllKernelsAndCheckErrors = false; -bool MemoryManager<Serial>::ReduceMemoryFootprint = false; -bool DeviceRuntime<Serial>::TimingAllKernels = false; -bool DeviceRuntime<Serial>::PrintKernelConfig = false; +bool DeviceRuntime<SERIAL>::SyncAllKernelsAndCheckErrors = false; +bool MemoryManager<SERIAL>::ReduceMemoryFootprint = false; +bool DeviceRuntime<SERIAL>::TimingAllKernels = false; +bool DeviceRuntime<SERIAL>::PrintKernelConfig = false; -AutoTuningTable<Serial> AutoTuner<Serial>::autoTuningTable; -bool AutoTuner<Serial>::ProfileKernels = false; +AutoTuningTable<SERIAL> AutoTuner<SERIAL>::autoTuningTable; +bool AutoTuner<SERIAL>::ProfileKernels = false; -template <> bool deviceAvailable<Serial>() { - return DeviceRuntime<Serial>::GetDeviceCount() > 0; +template <> bool deviceAvailable<SERIAL>() { + return DeviceRuntime<SERIAL>::GetDeviceCount() > 0; } } // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSycl.cpp b/src/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSycl.cpp new file mode 100644 index 0000000000..f950c2dc6f --- /dev/null +++ b/src/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSycl.cpp @@ -0,0 +1,29 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory.
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/RuntimeX/RuntimeX.h" + +namespace mgard_x { + +int DeviceRuntime<SYCL>::curr_dev_id = 0; +DeviceQueues<SYCL> DeviceRuntime<SYCL>::queues; +DeviceSpecification<SYCL> DeviceRuntime<SYCL>::DeviceSpecs; + +// SyncAllKernelsAndCheckErrors needs to be always ON for SYCL +bool DeviceRuntime<SYCL>::SyncAllKernelsAndCheckErrors = true; +bool MemoryManager<SYCL>::ReduceMemoryFootprint = false; +bool DeviceRuntime<SYCL>::TimingAllKernels = false; +bool DeviceRuntime<SYCL>::PrintKernelConfig = false; + +AutoTuningTable<SYCL> AutoTuner<SYCL>::autoTuningTable; +bool AutoTuner<SYCL>::ProfileKernels = false; + +template <> bool deviceAvailable<SYCL>() { + return DeviceRuntime<SYCL>::GetDeviceCount() > 0; +} + +} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/Utilities/CMakeLists.txt b/src/mgard-x/Utilities/CMakeLists.txt deleted file mode 100644 index d5fa582eab..0000000000 --- a/src/mgard-x/Utilities/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -list(APPEND MGARD_X_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/CheckEndianess.cpp) -set(MGARD_X_SRC ${MGARD_X_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/Utilities/CheckEndianess.cpp b/src/mgard-x/Utilities/CheckEndianess.cpp deleted file mode 100644 index badd2524cf..0000000000 --- a/src/mgard-x/Utilities/CheckEndianess.cpp +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/Types.h" - -namespace mgard_x { -enum endiness_type CheckEndianess() { - int i = 1; - char *p = (char *)&i; - if (p[0] == 1) { - return endiness_type::Little_Endian; - } else { - return endiness_type::Big_Endian; - } -} - -} // namespace mgard_x
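
Note on the generated sources above: each *.cpp.in template is expanded once per (NUM_DIM, DATA_TYPE, DEVICE_TYPE) combination by MgardXGenerateSourceAllCombinations, yielding one explicit-instantiation translation unit per combination. A minimal sketch of one expanded unit, assuming a configure_file-style substitution of the @...@ placeholders (the generator module itself is not shown in this section of the patch):

// Decompose.cpp as it would be generated from Decompose.cpp.in for the
// hypothetical combination NUM_DIM=3, DATA_TYPE=float, DEVICE_TYPE=SYCL.
#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp"

namespace mgard_x {

template void decompose_single<3, float, SYCL>(
    Hierarchy<3, float, SYCL> &hierarchy,
    SubArray<3, float, SYCL> &v, SIZE l_target, int queue_idx);

} // namespace mgard_x

Splitting the instantiations this way keeps each generated file small, so per-backend combinations compile in parallel and only the enabled backends are built.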
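Note on backend selection: the executables map the -d flag onto the renamed device_type enumerators (AUTO, SERIAL, CUDA, HIP, SYCL). A hedged sketch of doing the same from library code, assuming the high-level entry point keeps the argument order visible in the deleted instantiation files and that Config exposes a dev_type field (both are assumptions here, not verbatim API from this diff):

#include "mgard-x/HighLevelAPI.hpp" // header name as used in the deleted files

#include <cstdlib>
#include <vector>

int main() {
  std::vector<mgard_x::SIZE> shape{64, 64, 64};
  std::vector<float> original(64 * 64 * 64, 1.0f);

  mgard_x::Config config;
  config.dev_type = mgard_x::device_type::SYCL; // assumed field; AUTO would pick a built backend

  void *compressed = nullptr;
  size_t compressed_size = 0;
  // (D, dtype, shape, tol, s, error mode, input, output, size, config, preallocated)
  mgard_x::compress(3, mgard_x::data_type::Float, shape, 1e-3, 0.0,
                    mgard_x::error_bound_type::REL, original.data(),
                    compressed, compressed_size, config, false);

  std::free(compressed); // assumes the library heap-allocates the output when not preallocated
  return 0;
}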
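Note on the SYCL device adapter: deviceAvailable<SYCL>() above reduces to DeviceRuntime<SYCL>::GetDeviceCount() > 0. For illustration only (the adapter's internals are outside this diff), such a count can be obtained with standard SYCL 2020 device enumeration:

#include <sycl/sycl.hpp>

// Count GPU devices visible to the SYCL runtime across all platforms;
// the backend counts as available when this is non-zero.
int sycl_gpu_device_count() {
  auto gpus = sycl::device::get_devices(sycl::info::device_type::gpu);
  return static_cast<int>(gpus.size());
}

The adapter also forces SyncAllKernelsAndCheckErrors to true, which suggests every kernel submission is followed by a queue wait so asynchronous SYCL errors surface at a predictable point; that is an inference from the comment in the file, not code shown in this diff.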